From 245141e00a6fe2e72da701a72be6e72ad116154f Mon Sep 17 00:00:00 2001
From: Jin Shang
Date: Tue, 18 Jul 2023 21:46:42 +0800
Subject: [PATCH 001/749] GH-35942: [C++] Improve Decimal ToReal accuracy (#36667)

### Rationale for this change

The current implementation of `Decimal::ToReal` can be naively represented as the following pseudocode:

```
Real v = static_cast<Real>(decimal.as_int128/256())
return v * (10.0**-scale)
```

It stores the intermediate unscaled int128/256 value as a float/double. The unscaled int128/256 value can be very large when the decimal has a large scale, which causes precision issues such as in #36602.

### What changes are included in this PR?

Avoid storing the unscaled large int as float if the representation is not precise, by splitting the decimal into integral and fractional parts and dealing with them separately. This algorithm guarantees that:

1. If the decimal is an integer, the conversion is exact.
2. If the number of fractional digits is <= `RealTraits<Real>::kMantissaDigits` (e.g. 8 for float and 16 for double), the conversion is within 1 ULP of the exact value. For example, `Decimal128::ToReal(9999.999)` falls into this category because the integer 9999999 is precisely representable by float, whereas 9999.9999 would be in the next category.
3. Otherwise, the conversion is within 2^(-RealTraits<Real>::kMantissaDigits+1) (e.g. 2^-23 for float and 2^-52 for double) of the exact value.

Here "exact value" means the closest value representable by Real.

I believe this algorithm is good enough, because an "exact" algorithm would require iterative multiplication and subtraction of decimals to determine the binary representation of the fractional part. Yet the result would still almost always be inexact, because a float/double can only exactly represent sums of powers of two. IMHO it's not worth spending that many expensive operations just to improve the result by one ULP.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
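For illustration, the split described above can be condensed into the following sketch. This is a paraphrase of the `ToRealPositive` overloads added in `cpp/src/arrow/util/decimal.cc` below, not the verbatim implementation; in particular, `FitsInMantissa` is a hypothetical stand-in for the patch's inline check against `RealTraits<Real>::kMaxPreciseInteger`.

```cpp
// Sketch of the whole/fraction split (condensed from the patch below;
// FitsInMantissa is a hypothetical name for the inline check against
// RealTraits<Real>::kMaxPreciseInteger in the real code).
template <typename Real>
Real ToRealPositive(const Decimal128& decimal, int32_t scale) {
  if (scale <= 0 || FitsInMantissa<Real>(decimal)) {
    // Already an integer, or small enough for Real's mantissa: convert directly.
    return ToRealPositiveNoSplit<Real>(decimal, scale);
  }
  // Split into whole and fractional parts so that neither intermediate
  // value exceeds what the mantissa can represent precisely.
  BasicDecimal128 whole, fraction;
  decimal.GetWholeAndFraction(scale, &whole, &fraction);
  return ToRealPositiveNoSplit<Real>(whole, /*scale=*/0) +
         ToRealPositiveNoSplit<Real>(fraction, scale);
}
```

The whole part is converted with scale 0, which is exact by guarantee 1, so any rounding error is confined to the fractional term, whose magnitude is bounded by 1 — this is what yields the 2^(-kMantissaDigits+1) bound in guarantee 3.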
* Closes: #35942 Lead-authored-by: Jin Shang Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../arrow/compute/kernels/scalar_cast_test.cc | 3 +- cpp/src/arrow/util/basic_decimal.cc | 10 ++ cpp/src/arrow/util/basic_decimal.h | 4 + cpp/src/arrow/util/decimal.cc | 58 ++++++- cpp/src/arrow/util/decimal_internal.h | 4 + cpp/src/arrow/util/decimal_test.cc | 148 +++++++++++++----- 6 files changed, 183 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 083a85eb346c5..1db06a762544b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1025,7 +1025,8 @@ TEST(Cast, DecimalToFloating) { } } - // Edge cases are tested for Decimal128::ToReal() and Decimal256::ToReal() + // Edge cases are tested for Decimal128::ToReal() and Decimal256::ToReal() in + // decimal_test.cc } TEST(Cast, DecimalToString) { diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index f2fd39d6f37ad..0835ab9074a48 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -969,6 +969,16 @@ bool BasicDecimal256::FitsInPrecision(int32_t precision) const { return BasicDecimal256::Abs(*this) < kDecimal256PowersOfTen[precision]; } +void BasicDecimal256::GetWholeAndFraction(int scale, BasicDecimal256* whole, + BasicDecimal256* fraction) const { + DCHECK_GE(scale, 0); + DCHECK_LE(scale, 76); + + BasicDecimal256 multiplier(kDecimal256PowersOfTen[scale]); + auto s = Divide(multiplier, whole, fraction); + DCHECK_EQ(s, DecimalStatus::kSuccess); +} + const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) { DCHECK_GE(scale, 0); DCHECK_LE(scale, 76); diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h index b263bb234a795..d8a91ea76b390 100644 --- a/cpp/src/arrow/util/basic_decimal.h +++ b/cpp/src/arrow/util/basic_decimal.h @@ -366,6 +366,10 @@ class ARROW_EXPORT BasicDecimal256 : public GenericBasicDecimal - static Real ToRealPositive(const Decimal128& decimal, int32_t scale) { + static Real ToRealPositiveNoSplit(const Decimal128& decimal, int32_t scale) { Real x = RealTraits::two_to_64(static_cast(decimal.high_bits())); x += static_cast(decimal.low_bits()); x *= LargePowerOfTen(-scale); return x; } + + /// An appoximate conversion from Decimal128 to Real that guarantees: + /// 1. If the decimal is an integer, the conversion is exact. + /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. + /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact + /// value. + /// 3. Otherwise, the conversion is within 2^(-RealTraits::kMantissaDigits+1) + /// (e.g. 2^-23 for float and 2^-52 for double) of the exact value. + /// Here "exact value" means the closest representable value by Real. 
+ template + static Real ToRealPositive(const Decimal128& decimal, int32_t scale) { + if (scale <= 0 || (decimal.high_bits() == 0 && + decimal.low_bits() <= RealTraits::kMaxPreciseInteger)) { + // No need to split the decimal if it is already an integer (scale <= 0) or if it + // can be precisely represented by Real + return ToRealPositiveNoSplit(decimal, scale); + } + + // Split decimal into whole and fractional parts to avoid precision loss + BasicDecimal128 whole_decimal, fraction_decimal; + decimal.GetWholeAndFraction(scale, &whole_decimal, &fraction_decimal); + + Real whole = ToRealPositiveNoSplit(whole_decimal, 0); + Real fraction = ToRealPositiveNoSplit(fraction_decimal, scale); + + return whole + fraction; + } }; } // namespace @@ -967,7 +994,7 @@ struct Decimal256RealConversion } template - static Real ToRealPositive(const Decimal256& decimal, int32_t scale) { + static Real ToRealPositiveNoSplit(const Decimal256& decimal, int32_t scale) { DCHECK_GE(decimal, 0); Real x = 0; const auto parts_le = bit_util::little_endian::Make(decimal.native_endian_array()); @@ -978,6 +1005,33 @@ struct Decimal256RealConversion x *= LargePowerOfTen(-scale); return x; } + + /// An appoximate conversion from Decimal256 to Real that guarantees: + /// 1. If the decimal is an integer, the conversion is exact. + /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. + /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact + /// value. + /// 3. Otherwise, the conversion is within 2^(-RealTraits::kMantissaDigits+1) + /// (e.g. 2^-23 for float and 2^-52 for double) of the exact value. + /// Here "exact value" means the closest representable value by Real. + template + static Real ToRealPositive(const Decimal256& decimal, int32_t scale) { + const auto parts_le = decimal.little_endian_array(); + if (scale <= 0 || (parts_le[3] == 0 && parts_le[2] == 0 && parts_le[1] == 0 && + parts_le[0] < RealTraits::kMaxPreciseInteger)) { + // No need to split the decimal if it is already an integer (scale <= 0) or if it + // can be precisely represented by Real + return ToRealPositiveNoSplit(decimal, scale); + } + + // Split the decimal into whole and fractional parts to avoid precision loss + BasicDecimal256 whole_decimal, fraction_decimal; + decimal.GetWholeAndFraction(scale, &whole_decimal, &fraction_decimal); + + Real whole = ToRealPositiveNoSplit(whole_decimal, 0); + Real fraction = ToRealPositiveNoSplit(fraction_decimal, scale); + return whole + fraction; + } }; } // namespace diff --git a/cpp/src/arrow/util/decimal_internal.h b/cpp/src/arrow/util/decimal_internal.h index 041aac4ef860d..51a7229ab6678 100644 --- a/cpp/src/arrow/util/decimal_internal.h +++ b/cpp/src/arrow/util/decimal_internal.h @@ -451,6 +451,8 @@ struct RealTraits { static constexpr int kMantissaBits = 24; // ceil(log10(2 ^ kMantissaBits)) static constexpr int kMantissaDigits = 8; + // Integers between zero and kMaxPreciseInteger can be precisely represented + static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; }; template <> @@ -464,6 +466,8 @@ struct RealTraits { static constexpr int kMantissaBits = 53; // ceil(log10(2 ^ kMantissaBits)) static constexpr int kMantissaDigits = 16; + // Integers between zero and kMaxPreciseInteger can be precisely represented + static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; }; template diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc index 1401750ce76d6..6376a9545a0f8 100644 --- 
a/cpp/src/arrow/util/decimal_test.cc +++ b/cpp/src/arrow/util/decimal_test.cc @@ -1050,6 +1050,24 @@ void CheckDecimalToReal(const std::string& decimal_value, int32_t scale, Real ex << "Decimal value: " << decimal_value << " Scale: " << scale; } +template +void CheckDecimalToRealWithinOneULP(const std::string& decimal_value, int32_t scale, + Real expected) { + Decimal dec(decimal_value); + auto result = dec.template ToReal(scale); + ASSERT_TRUE(result == expected || result == std::nextafter(expected, expected + 1) || + result == std::nextafter(expected, expected - 1)) + << "Decimal value: " << decimal_value << " Scale: " << scale; +} + +template +void CheckDecimalToRealWithinEpsilon(const std::string& decimal_value, int32_t scale, + Real epsilon, Real expected) { + Decimal dec(decimal_value); + ASSERT_TRUE(std::abs(dec.template ToReal(scale) - expected) <= epsilon) + << "Decimal value: " << decimal_value << " Scale: " << scale; +} + template void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, float expected) { @@ -1110,59 +1128,79 @@ class TestDecimalToReal : public ::testing::Test { } } } +}; - // Test precision of conversions to float values - void TestPrecision() { - // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision) - CheckDecimalToReal("9223373136366403584", 0, 9.223373e+18f); - CheckDecimalToReal("-9223373136366403584", 0, -9.223373e+18f); - // 2**64 + 2**41 (exactly representable in a float) - CheckDecimalToReal("18446746272732807168", 0, 1.8446746e+19f); - CheckDecimalToReal("-18446746272732807168", 0, -1.8446746e+19f); - } +TYPED_TEST_SUITE(TestDecimalToReal, RealTypes); +TYPED_TEST(TestDecimalToReal, TestSuccess) { this->TestSuccess(); } + +// Custom test for Decimal::ToReal +template +class TestDecimalToRealFloat : public TestDecimalToReal> {}; +TYPED_TEST_SUITE(TestDecimalToRealFloat, DecimalTypes); - // Test conversions with a range of scales - void TestLargeValues(int32_t max_scale) { - // Note that exact comparisons would succeed on some platforms (Linux, macOS). - // Nevertheless, power-of-ten factors are not all exactly representable - // in binary floating point. - for (int32_t scale = -max_scale; scale <= max_scale; scale++) { +TYPED_TEST(TestDecimalToRealFloat, LargeValues) { + auto max_scale = TypeParam::kMaxScale; + // Note that exact comparisons would succeed on some platforms (Linux, macOS). + // Nevertheless, power-of-ten factors are not all exactly representable + // in binary floating point. 
+ for (int32_t scale = -max_scale; scale <= max_scale; scale++) { #ifdef _WIN32 - // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero - if (scale == 45) continue; + // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero + if (scale == 45) continue; #endif - CheckDecimalToRealApprox("1", scale, Pow10(-scale)); - } - for (int32_t scale = -max_scale; scale <= max_scale - 2; scale++) { + CheckDecimalToRealApprox("1", scale, this->Pow10(-scale)); + } + for (int32_t scale = -max_scale; scale <= max_scale - 2; scale++) { #ifdef _WIN32 - // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero - if (scale == 45) continue; + // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero + if (scale == 45) continue; #endif - const Real factor = static_cast(123); - CheckDecimalToRealApprox("123", scale, factor * Pow10(-scale)); - } + const auto factor = static_cast(123); + CheckDecimalToRealApprox("123", scale, factor * this->Pow10(-scale)); } -}; - -TYPED_TEST_SUITE(TestDecimalToReal, RealTypes); - -TYPED_TEST(TestDecimalToReal, TestSuccess) { this->TestSuccess(); } +} -// Custom test for Decimal128::ToReal -class TestDecimal128ToRealFloat : public TestDecimalToReal> { -}; -TEST_F(TestDecimal128ToRealFloat, LargeValues) { TestLargeValues(/*max_scale=*/38); } -TEST_F(TestDecimal128ToRealFloat, Precision) { this->TestPrecision(); } -// Custom test for Decimal256::ToReal -class TestDecimal256ToRealFloat : public TestDecimalToReal> { -}; -TEST_F(TestDecimal256ToRealFloat, LargeValues) { TestLargeValues(/*max_scale=*/76); } -TEST_F(TestDecimal256ToRealFloat, Precision) { this->TestPrecision(); } +TYPED_TEST(TestDecimalToRealFloat, Precision) { + // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision) + CheckDecimalToReal("9223373136366403584", 0, 9.223373e+18f); + CheckDecimalToReal("-9223373136366403584", 0, -9.223373e+18f); + // 2**64 + 2**41 (exactly representable in a float) + CheckDecimalToReal("18446746272732807168", 0, 1.8446746e+19f); + CheckDecimalToReal("-18446746272732807168", 0, -1.8446746e+19f); + + // Integers are always exact + auto scale = TypeParam::kMaxScale - 1; + std::string seven = "7."; + seven.append(scale, '0'); // pad with trailing zeros + CheckDecimalToReal(seven, scale, 7.0f); + CheckDecimalToReal("-" + seven, scale, -7.0f); + + CheckDecimalToReal("99999999999999999999.0000000000000000", 16, + 99999999999999999999.0f); + CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, + -99999999999999999999.0f); + + // Small fractions are within one ULP + CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9f); + CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9f); + CheckDecimalToRealWithinOneULP("9999999.999999", 6, 9999999.999999f); + CheckDecimalToRealWithinOneULP("-9999999.999999", 6, + -9999999.999999f); + + // Large fractions are within 2^-23 + constexpr float epsilon = 1.1920928955078125e-07f; // 2^-23 + CheckDecimalToRealWithinEpsilon( + "112334829348925.99070703983306884765625", 23, epsilon, + 112334829348925.99070703983306884765625f); + CheckDecimalToRealWithinEpsilon( + "1.987748987892758765582589910934859345", 36, epsilon, + 1.987748987892758765582589910934859345f); +} // ToReal tests are disabled on MinGW because of precision issues in results #ifndef __MINGW32__ -// Custom test for Decimal128::ToReal +// Custom test for Decimal::ToReal template class TestDecimalToRealDouble : public TestDecimalToReal> { }; @@ -1209,6 +1247,34 @@ TYPED_TEST(TestDecimalToRealDouble, Precision) { 
9.999999999999998e+47); CheckDecimalToReal("-99999999999999978859343891977453174784", -10, -9.999999999999998e+47); + // Integers are always exact + auto scale = TypeParam::kMaxScale - 1; + std::string seven = "7."; + seven.append(scale, '0'); + CheckDecimalToReal(seven, scale, 7.0); + CheckDecimalToReal("-" + seven, scale, -7.0); + + CheckDecimalToReal("99999999999999999999.0000000000000000", 16, + 99999999999999999999.0); + CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, + -99999999999999999999.0); + + // Small fractions are within one ULP + CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9); + CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9); + CheckDecimalToRealWithinOneULP("9999999.999999999999999", 15, + 9999999.999999999999999); + CheckDecimalToRealWithinOneULP("-9999999.999999999999999", 15, + -9999999.999999999999999); + + // Large fractions are within 2^-52 + constexpr double epsilon = 2.220446049250313080847263336181640625e-16; // 2^-52 + CheckDecimalToRealWithinEpsilon( + "112334829348925.99070703983306884765625", 23, epsilon, + 112334829348925.99070703983306884765625); + CheckDecimalToRealWithinEpsilon( + "1.987748987892758765582589910934859345", 36, epsilon, + 1.987748987892758765582589910934859345); } #endif // __MINGW32__

From de8df23a8cd9737b4df5bb1b68fc12a54f252d0d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 18 Jul 2023 16:41:55 +0200
Subject: [PATCH 002/749] GH-36744: [Python][Packaging] Add upper pin for cython<3 to pyarrow build dependencies (#36743)

### Rationale for this change

Although we already fixed some Cython 3 build issues (https://github.com/apache/arrow/pull/34726), some new ones have been introduced, which we are seeing now that Cython 3 has been released (https://github.com/apache/arrow/issues/36730).

Adding an upper pin (<3) for the release gives us more time (the full 14.0 release cycle) to iron out the issues.
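For reference, this is the resulting constraint as it lands in the packaging files in the diff below (e.g. `python/requirements-build.txt` and `python/pyproject.toml`):

```
cython>=0.29.31,<3
```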
* Closes: #36744 Authored-by: Joris Van den Bossche Signed-off-by: Antoine Pitrou --- .github/workflows/dev.yml | 2 +- ci/conda_env_python.txt | 2 +- python/pyproject.toml | 2 +- python/requirements-build.txt | 2 +- python/requirements-wheel-build.txt | 2 +- python/setup.py | 7 ++++--- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 7c2437f6edfb5..119d11d9a399a 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -103,7 +103,7 @@ jobs: shell: bash run: | gem install test-unit - pip install cython setuptools six pytest jira + pip install "cython<3" setuptools six pytest jira - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 04f985c94bb2c..4ae5c3614a1dc 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -18,7 +18,7 @@ # don't add pandas here, because it is not a mandatory test dependency boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture cffi -cython +cython<3 cloudpickle fsspec hypothesis diff --git a/python/pyproject.toml b/python/pyproject.toml index fe8c938a9ce4f..7e61304585809 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,7 +17,7 @@ [build-system] requires = [ - "cython >= 0.29.31", + "cython >= 0.29.31,<3", "oldest-supported-numpy>=0.14", "setuptools_scm", "setuptools >= 40.1.0", diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 507e9081373e2..6378d1b94e1bb 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31 +cython>=0.29.31,<3 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=38.6.0 diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index 6043d2ffb2c6e..e4f5243fbc2fe 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31 +cython>=0.29.31,<3 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=58 diff --git a/python/setup.py b/python/setup.py index f06cb5a627562..dc529679c7f90 100755 --- a/python/setup.py +++ b/python/setup.py @@ -40,8 +40,9 @@ # Check if we're running 64-bit Python is_64_bit = sys.maxsize > 2**32 -if Cython.__version__ < '0.29.31': - raise Exception('Please upgrade to Cython 0.29.31 or newer') +if Cython.__version__ < '0.29.31' or Cython.__version__ >= '3.0': + raise Exception( + 'Please update your Cython version. Supported Cython >= 0.29.31, < 3.0') setup_dir = os.path.abspath(os.path.dirname(__file__)) @@ -491,7 +492,7 @@ def has_ext_modules(foo): 'pyarrow/_generated_version.py'), 'version_scheme': guess_next_dev_version }, - setup_requires=['setuptools_scm', 'cython >= 0.29.31'] + setup_requires, + setup_requires=['setuptools_scm', 'cython >= 0.29.31,<3'] + setup_requires, install_requires=install_requires, tests_require=['pytest', 'pandas', 'hypothesis'], python_requires='>=3.8',

From f9904063b163c4ad44bef61e84a1e4a90b600d34 Mon Sep 17 00:00:00 2001
From: mwish
Date: Wed, 19 Jul 2023 00:32:34 +0800
Subject: [PATCH 003/749] GH-35934: [C++][Parquet] PageIndex Read benchmark (#36702)

### Rationale for this change

Add a benchmark for reading the page index.

### What changes are included in this PR?

A benchmark in `cpp/src/parquet/page_index_benchmark.cc`, plus shared data-generation helpers factored out of `bloom_filter_benchmark.cc` into `benchmark_util.cc`.

### Are these changes tested?

No

### Are there any user-facing changes?
No * Closes: #35934 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/dataset/dataset.h | 2 +- cpp/src/parquet/CMakeLists.txt | 5 +- cpp/src/parquet/benchmark_util.cc | 126 ++++++++++++++++++ cpp/src/parquet/benchmark_util.h | 47 +++++++ cpp/src/parquet/bloom_filter_benchmark.cc | 69 ++-------- cpp/src/parquet/level_conversion_benchmark.cc | 2 +- cpp/src/parquet/page_index_benchmark.cc | 107 +++++++++++++++ cpp/src/parquet/test_util.h | 2 +- 8 files changed, 296 insertions(+), 64 deletions(-) create mode 100644 cpp/src/parquet/benchmark_util.cc create mode 100644 cpp/src/parquet/benchmark_util.h create mode 100644 cpp/src/parquet/page_index_benchmark.cc diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 1db230b16e9c2..39936fbd7b5b2 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -82,7 +82,7 @@ class ARROW_DS_EXPORT FragmentSelection { /// \brief Instructions for scanning a particular fragment /// -/// The fragment scan request is dervied from ScanV2Options. The main +/// The fragment scan request is derived from ScanV2Options. The main /// difference is that the scan options are based on the dataset schema /// while the fragment request is based on the fragment schema. struct ARROW_DS_EXPORT FragmentScanRequest { diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e6aad7cee2a3e..eb2e2d8fed88f 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -401,11 +401,14 @@ endif() add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc test_util.cc) add_parquet_test(schema_test) -add_parquet_benchmark(bloom_filter_benchmark) +add_parquet_benchmark(bloom_filter_benchmark SOURCES bloom_filter_benchmark.cc + benchmark_util.cc) add_parquet_benchmark(column_reader_benchmark) add_parquet_benchmark(column_io_benchmark) add_parquet_benchmark(encoding_benchmark) add_parquet_benchmark(level_conversion_benchmark) +add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc + benchmark_util.cc) add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow") if(ARROW_WITH_BROTLI) diff --git a/cpp/src/parquet/benchmark_util.cc b/cpp/src/parquet/benchmark_util.cc new file mode 100644 index 0000000000000..6220336e1c39e --- /dev/null +++ b/cpp/src/parquet/benchmark_util.cc @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/benchmark_util.h" + +#include + +namespace parquet::benchmark { + +namespace { + +void GenerateRandomString(uint32_t length, uint32_t seed, std::vector* heap) { + // Character set used to generate random string + const std::string charset = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::default_random_engine gen(seed); + std::uniform_int_distribution dist(0, static_cast(charset.size() - 1)); + + for (uint32_t i = 0; i < length; i++) { + heap->emplace_back(charset[dist(gen)]); + } +} + +template +void GenerateBenchmarkDataIntegerImpl(uint32_t size, uint32_t seed, T* data, + std::vector* heap, uint32_t) { + static_assert(std::is_integral_v); + heap->clear(); + std::default_random_engine gen(seed); + std::uniform_int_distribution d(std::numeric_limits::min(), + std::numeric_limits::max()); + for (uint32_t i = 0; i < size; ++i) { + data[i] = d(gen); + } +} + +template +void GenerateBenchmarkDataFloatImpl(uint32_t size, uint32_t seed, T* data, + std::vector* heap, uint32_t) { + static_assert(std::is_floating_point_v); + heap->clear(); + std::default_random_engine gen(seed); + std::uniform_real_distribution d(std::numeric_limits::lowest(), + std::numeric_limits::max()); + for (uint32_t i = 0; i < size; ++i) { + data[i] = d(gen); + } +} + +} // namespace + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, int32_t* data, + std::vector* heap, uint32_t data_string_length) { + GenerateBenchmarkDataIntegerImpl(size, seed, data, heap, data_string_length); +} + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, int64_t* data, + std::vector* heap, uint32_t data_string_length) { + GenerateBenchmarkDataIntegerImpl(size, seed, data, heap, data_string_length); +} + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, float* data, + std::vector* heap, uint32_t data_string_length) { + GenerateBenchmarkDataFloatImpl(size, seed, data, heap, data_string_length); +} + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, double* data, + std::vector* heap, uint32_t data_string_length) { + GenerateBenchmarkDataFloatImpl(size, seed, data, heap, data_string_length); +} + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, Int96* data, + std::vector* heap, uint32_t) { + heap->clear(); + std::default_random_engine gen(seed); + std::uniform_int_distribution d(std::numeric_limits::min(), + std::numeric_limits::max()); + for (uint32_t i = 0; i < size; ++i) { + data[i].value[0] = d(gen); + data[i].value[1] = d(gen); + data[i].value[2] = d(gen); + } +} + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, FLBA* data, + std::vector* heap, uint32_t data_string_length) { + heap->clear(); + GenerateRandomString(data_string_length * size, seed, heap); + for (uint32_t i = 0; i < size; ++i) { + data[i].ptr = heap->data() + i * data_string_length; + } +} + +template <> +void GenerateBenchmarkData(uint32_t size, uint32_t seed, ByteArray* data, + std::vector* heap, uint32_t data_string_length) { + heap->clear(); + GenerateRandomString(data_string_length * size, seed, heap); + for (uint32_t i = 0; i < size; ++i) { + data[i].ptr = heap->data() + i * data_string_length; + data[i].len = data_string_length; + } +} + +} // namespace parquet::benchmark diff --git a/cpp/src/parquet/benchmark_util.h b/cpp/src/parquet/benchmark_util.h new file mode 100644 index 0000000000000..7996f7f85e898 --- /dev/null +++ b/cpp/src/parquet/benchmark_util.h @@ -0,0 +1,47 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/types.h" + +namespace parquet::benchmark { + +template +void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data, + std::vector* heap, uint32_t data_string_length); + +#define _GENERATE_BENCHMARK_DATA_DECL(KLASS) \ + template <> \ + void GenerateBenchmarkData(uint32_t size, uint32_t seed, KLASS* data, \ + std::vector* heap, uint32_t data_string_length); + +_GENERATE_BENCHMARK_DATA_DECL(int32_t) +_GENERATE_BENCHMARK_DATA_DECL(int64_t) +_GENERATE_BENCHMARK_DATA_DECL(float) +_GENERATE_BENCHMARK_DATA_DECL(double) +_GENERATE_BENCHMARK_DATA_DECL(ByteArray) +_GENERATE_BENCHMARK_DATA_DECL(FLBA) +_GENERATE_BENCHMARK_DATA_DECL(Int96) + +#undef _GENERATE_BENCHMARK_DATA_DECL + +} // namespace parquet::benchmark diff --git a/cpp/src/parquet/bloom_filter_benchmark.cc b/cpp/src/parquet/bloom_filter_benchmark.cc index fa934b1d5290a..13c731d975b2c 100644 --- a/cpp/src/parquet/bloom_filter_benchmark.cc +++ b/cpp/src/parquet/bloom_filter_benchmark.cc @@ -18,13 +18,13 @@ #include "benchmark/benchmark.h" #include "arrow/util/logging.h" +#include "parquet/benchmark_util.h" #include "parquet/bloom_filter.h" #include "parquet/properties.h" #include -namespace parquet { -namespace benchmark { +namespace parquet::benchmark { constexpr static uint32_t kNumBloomFilterInserts = 16 * 1024; // The sample string length for FLBA and ByteArray benchmarks @@ -40,63 +40,11 @@ std::unique_ptr CreateBloomFilter(uint32_t num_values) { return bloom_filter; } -void GenerateRandomString(uint32_t length, uint32_t seed, std::vector* heap) { - // Character set used to generate random string - const std::string charset = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - - std::default_random_engine gen(seed); - std::uniform_int_distribution dist(0, static_cast(charset.size() - 1)); - - for (uint32_t i = 0; i < length; i++) { - heap->push_back(charset[dist(gen)]); - } -} - -template -void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data, - [[maybe_unused]] std::vector* heap = nullptr) { - if constexpr (std::is_integral_v) { - std::default_random_engine gen(seed); - std::uniform_int_distribution d(std::numeric_limits::min(), - std::numeric_limits::max()); - for (uint32_t i = 0; i < size; ++i) { - data[i] = d(gen); - } - } else if constexpr (std::is_floating_point_v) { - std::default_random_engine gen(seed); - std::uniform_real_distribution d(std::numeric_limits::lowest(), - std::numeric_limits::max()); - for (uint32_t i = 0; i < size; ++i) { - data[i] = d(gen); - } - } else if constexpr (std::is_same_v) { - GenerateRandomString(kDataStringLength * size, seed, heap); - for (uint32_t i = 0; i < size; ++i) { - data[i].ptr = heap->data() + i * 
kDataStringLength; - } - } else if constexpr (std::is_same_v) { - GenerateRandomString(kDataStringLength * size, seed, heap); - for (uint32_t i = 0; i < size; ++i) { - data[i].ptr = heap->data() + i * kDataStringLength; - data[i].len = kDataStringLength; - } - } else if constexpr (std::is_same_v) { - std::default_random_engine gen(seed); - std::uniform_int_distribution d(std::numeric_limits::min(), - std::numeric_limits::max()); - for (uint32_t i = 0; i < size; ++i) { - data[i].value[0] = d(gen); - data[i].value[1] = d(gen); - data[i].value[2] = d(gen); - } - } -} - std::vector GetHashValues(uint32_t num_values, uint32_t seed) { // Generate sample data values std::vector values(num_values); - GenerateBenchmarkData(num_values, seed, values.data()); + std::vector heap; + GenerateBenchmarkData(num_values, seed, values.data(), &heap, kDataStringLength); // Create a temp filter to compute hash values auto filter = CreateBloomFilter(/*num_values=*/8); std::vector hashes(num_values); @@ -109,7 +57,8 @@ static void BM_ComputeHash(::benchmark::State& state) { using T = typename DType::c_type; std::vector values(kNumBloomFilterInserts); std::vector heap; - GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap); + GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap, + kDataStringLength); auto filter = CreateBloomFilter(kNumBloomFilterInserts); for (auto _ : state) { uint64_t total = 0; @@ -136,7 +85,8 @@ static void BM_BatchComputeHash(::benchmark::State& state) { using T = typename DType::c_type; std::vector values(kNumBloomFilterInserts); std::vector heap; - GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap); + GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), &heap, + kDataStringLength); auto filter = CreateBloomFilter(kNumBloomFilterInserts); std::vector hashes(kNumBloomFilterInserts); for (auto _ : state) { @@ -231,5 +181,4 @@ BENCHMARK(BM_BatchInsertHash); BENCHMARK(BM_FindExistingHash); BENCHMARK(BM_FindNonExistingHash); -} // namespace benchmark -} // namespace parquet +} // namespace parquet::benchmark diff --git a/cpp/src/parquet/level_conversion_benchmark.cc b/cpp/src/parquet/level_conversion_benchmark.cc index f9e91c4820f68..f3a4f8095e3a1 100644 --- a/cpp/src/parquet/level_conversion_benchmark.cc +++ b/cpp/src/parquet/level_conversion_benchmark.cc @@ -29,7 +29,7 @@ constexpr int16_t kMissingDefLevel = 0; // Definition Level indicating the values has an entry in the leaf element. constexpr int16_t kPresentDefLevel = 2; -// A repition level that indicates a repeated element. +// A repetition level that indicates a repeated element. constexpr int16_t kHasRepeatedElements = 1; std::vector RunDefinitionLevelsToBitmap(const std::vector& def_levels, diff --git a/cpp/src/parquet/page_index_benchmark.cc b/cpp/src/parquet/page_index_benchmark.cc new file mode 100644 index 0000000000000..5631034105056 --- /dev/null +++ b/cpp/src/parquet/page_index_benchmark.cc @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "benchmark/benchmark.h" + +#include "parquet/benchmark_util.h" +#include "parquet/metadata.h" +#include "parquet/page_index.h" +#include "parquet/schema.h" +#include "parquet/test_util.h" +#include "parquet/thrift_internal.h" + +namespace parquet::benchmark { + +void PageIndexSetArgs(::benchmark::internal::Benchmark* bench) { + bench->ArgNames({"num_pages"}); + bench->Range(8, 1024); +} + +void BM_ReadOffsetIndex(::benchmark::State& state) { + auto builder = OffsetIndexBuilder::Make(); + const int num_pages = static_cast(state.range(0)); + constexpr int64_t page_size = 1024; + constexpr int64_t first_row_index = 10000; + for (int i = 0; i < num_pages; ++i) { + builder->AddPage(page_size * i, page_size, first_row_index * i); + } + constexpr int64_t final_position = 4096; + builder->Finish(final_position); + auto sink = CreateOutputStream(); + builder->WriteTo(sink.get()); + auto buffer = sink->Finish().ValueOrDie(); + ReaderProperties properties; + for (auto _ : state) { + auto offset_index = OffsetIndex::Make( + buffer->data() + 0, static_cast(buffer->size()), properties); + ::benchmark::DoNotOptimize(offset_index); + } + state.SetBytesProcessed(state.iterations() * buffer->size()); + state.SetItemsProcessed(state.iterations() * num_pages); +} + +BENCHMARK(BM_ReadOffsetIndex)->Apply(PageIndexSetArgs); + +// The sample string length for FLBA and ByteArray benchmarks +constexpr static uint32_t kDataStringLength = 8; + +template +void BM_ReadColumnIndex(::benchmark::State& state) { + schema::NodePtr type = ::parquet::schema::PrimitiveNode::Make( + "b", Repetition::OPTIONAL, DType::type_num, ConvertedType::NONE, 8); + auto descr_ptr = + std::make_unique(type, /*def_level=*/1, /*rep_level=*/0); + auto descr = descr_ptr.get(); + + const int num_pages = static_cast(state.range(0)); + auto builder = ColumnIndexBuilder::Make(descr); + + const size_t values_per_page = 100; + for (int i = 0; i < num_pages; ++i) { + auto stats = MakeStatistics(descr); + std::vector heap; + std::vector values; + values.resize(values_per_page); + GenerateBenchmarkData(values_per_page, /*seed=*/0, values.data(), &heap, + kDataStringLength); + stats->Update(values.data(), values_per_page, /*null_count=*/0); + builder->AddPage(stats->Encode()); + } + + builder->Finish(); + auto sink = CreateOutputStream(); + builder->WriteTo(sink.get()); + auto buffer = sink->Finish().ValueOrDie(); + ReaderProperties properties; + for (auto _ : state) { + auto column_index = ColumnIndex::Make(*descr, buffer->data() + 0, + static_cast(buffer->size()), properties); + ::benchmark::DoNotOptimize(column_index); + } + state.SetBytesProcessed(state.iterations() * buffer->size()); + state.SetItemsProcessed(state.iterations() * num_pages); +} + +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, Int64Type)->Apply(PageIndexSetArgs); +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, DoubleType)->Apply(PageIndexSetArgs); +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, FLBAType)->Apply(PageIndexSetArgs); +BENCHMARK_TEMPLATE(BM_ReadColumnIndex, ByteArrayType)->Apply(PageIndexSetArgs); + +} // namespace parquet::benchmark diff --git 
a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index dfb4b5d0fbf4a..b0aafa037ead1 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -556,7 +556,7 @@ static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels } else { num_values = num_levels; } - // Create repitition levels + // Create repetition levels if (max_rep_level > 0 && num_levels != 0) { rep_levels.resize(num_levels); // Using a different seed so that def_levels and rep_levels are different. From e8214734459eff5cfc9e67e8b1fdef46f6d8c2ea Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Tue, 18 Jul 2023 13:07:26 -0400 Subject: [PATCH 004/749] GH-36734: [MATLAB] template arrow::matlab::proxy::NumericArray on ArrowType instead of CType (#36738) ### Rationale for this change We decided to change the template parameter on `arrow::matlab::proxy::NumericArray` to `ArrowType` from `CType` to avoid writing duplicate code. If `proxy::NumericArray` is templated on `ArrowType`, we can use it to implement the proxies for `Date64Array`, `Date32Array`, `Time32Array`, `Time64Array`, and `TimestampArray`. This will help us avoid duplicating code. ### What changes are included in this PR? 1. Changed the template on `proxy::NumericArray` from `CType` to `ArrowType` 2. Re-implemented the C++ proxy object used for `TimestampArray` in terms of `proxy::NumericArray` 3. Defined a template specialization for `NumericArray::make` when the template parameter is `arrow::TimestampType` 4. Defined a `proxy::Traits` `struct` that is templated on `ArrowType`. Specializations of `Traits` define a`TypeProxy` typedef that can be used at compile-time to get the proxy class that is used to wrap an `ArrowType`. ### Are these changes tested? Existing tests used. ### Are there any user-facing changes? No. 
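To make the compile-time mapping concrete, here is a condensed sketch of the `Traits` lookup (namespaces and most specializations omitted; the `arrow::FloatType` to `PrimitiveCType<float>` pairing is inferred from the factory registrations in this patch, while the `TimestampType` mapping appears verbatim in the new header below):

```cpp
// Condensed sketch of arrow/matlab/type/proxy/traits.h from this patch.
template <typename ArrowType>
struct Traits;  // primary template left undefined: unmapped types fail to compile

template <>
struct Traits<arrow::FloatType> {
  using TypeProxy = PrimitiveCType<float>;  // inferred pairing, see factory.cc
};

template <>
struct Traits<arrow::TimestampType> {
  using TypeProxy = TimestampType;
};

// Used inside NumericArray<ArrowType>::typeProxy() to resolve the proxy
// class at compile time instead of hard-coding one per array proxy:
//   using TypeProxy = typename type::proxy::Traits<ArrowType>::TypeProxy;
```

This compile-time lookup is what allows the single `NumericArray<ArrowType>` proxy to replace the dedicated `TimestampArray` proxy deleted in this patch.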
* Closes: #36734 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- .../arrow/matlab/array/proxy/numeric_array.h | 66 +++++++++++-- .../matlab/array/proxy/timestamp_array.cc | 99 ------------------- .../matlab/array/proxy/timestamp_array.h | 43 -------- matlab/src/cpp/arrow/matlab/proxy/factory.cc | 23 +++-- .../src/cpp/arrow/matlab/type/proxy/traits.h | 90 +++++++++++++++++ .../cmake/BuildMatlabArrowInterface.cmake | 1 - 6 files changed, 158 insertions(+), 164 deletions(-) delete mode 100644 matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc delete mode 100644 matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h create mode 100644 matlab/src/cpp/arrow/matlab/type/proxy/traits.h diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h index c66c1d044fc12..f358e05db6318 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h @@ -24,7 +24,7 @@ #include "arrow/type_traits.h" #include "arrow/matlab/array/proxy/array.h" -#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/traits.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/bit/pack.h" @@ -33,20 +33,23 @@ #include "libmexclass/proxy/Proxy.h" +#include "arrow/matlab/type/time_unit.h" +#include "arrow/util/utf8.h" + namespace arrow::matlab::array::proxy { -template +template class NumericArray : public arrow::matlab::array::proxy::Array { public: - using ArrowType = typename arrow::CTypeTraits::ArrowType; NumericArray(const std::shared_ptr> numeric_array) : arrow::matlab::array::proxy::Array{std::move(numeric_array)} {} static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer; + using CType = typename arrow::TypeTraits::CType; using NumericArray = arrow::NumericArray; - using NumericArrayProxy = typename arrow::matlab::array::proxy::NumericArray; + using NumericArrayProxy = typename proxy::NumericArray; ::matlab::data::StructArray opts = constructor_arguments[0]; @@ -68,10 +71,11 @@ class NumericArray : public arrow::matlab::array::proxy::Array { protected: void toMATLAB(libmexclass::proxy::method::Context& context) override { - using ArrowArrayType = typename arrow::CTypeTraits::ArrayType; + using CType = typename arrow::TypeTraits::CType; + using NumericArray = arrow::NumericArray; const auto num_elements = static_cast(array->length()); - const auto numeric_array = std::static_pointer_cast(array); + const auto numeric_array = std::static_pointer_cast(array); const CType* const data_begin = numeric_array->raw_values(); const CType* const data_end = data_begin + num_elements; @@ -83,11 +87,55 @@ class NumericArray : public arrow::matlab::array::proxy::Array { } std::shared_ptr typeProxy() override { - using ArrowTypeProxy = type::proxy::PrimitiveCType; + using TypeProxy = typename type::proxy::Traits::TypeProxy; auto type = std::static_pointer_cast(array->type()); - return std::make_shared(std::move(type)); + return std::make_shared(std::move(type)); } - }; + // Specialization of NumericArray::Make for arrow::TimestampType. 
+ template <> + libmexclass::proxy::MakeResult NumericArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer; + using TimestampArray = arrow::TimestampArray; + using TimestampArrayProxy = arrow::matlab::array::proxy::NumericArray; + + mda::StructArray opts = constructor_arguments[0]; + + // Get the mxArray from constructor arguments + const mda::TypedArray timestamp_mda = opts[0]["MatlabArray"]; + const mda::TypedArray validity_bitmap_mda = opts[0]["Valid"]; + + const mda::TypedArray timezone_mda = opts[0]["TimeZone"]; + const mda::TypedArray units_mda = opts[0]["TimeUnit"]; + + // extract the time zone string + const std::u16string& u16_timezone = timezone_mda[0]; + MATLAB_ASSIGN_OR_ERROR(const auto timezone, + arrow::util::UTF16StringToUTF8(u16_timezone), + error::UNICODE_CONVERSION_ERROR_ID); + + // extract the time unit + const std::u16string& u16_timeunit = units_mda[0]; + MATLAB_ASSIGN_OR_ERROR(const auto time_unit, + arrow::matlab::type::timeUnitFromString(u16_timeunit), + error::UKNOWN_TIME_UNIT_ERROR_ID) + + // create the timestamp_type + auto data_type = arrow::timestamp(time_unit, timezone); + auto array_length = static_cast(timestamp_mda.getNumberOfElements()); // cast size_t to int64_t + + auto data_buffer = std::make_shared(timestamp_mda); + + // Pack the validity bitmap values. + MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap, + bit::packValid(validity_bitmap_mda), + error::BITPACK_VALIDITY_BITMAP_ERROR_ID); + + auto array_data = arrow::ArrayData::Make(data_type, array_length, {packed_validity_bitmap, data_buffer}); + auto timestamp_array = std::static_pointer_cast(arrow::MakeArray(array_data)); + return std::make_shared(std::move(timestamp_array)); + } + } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc deleted file mode 100644 index b9bbf3d7e7942..0000000000000 --- a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/matlab/array/proxy/timestamp_array.h" -#include "arrow/matlab/type/proxy/timestamp_type.h" - -#include "arrow/matlab/error/error.h" -#include "arrow/matlab/bit/pack.h" -#include "arrow/matlab/bit/unpack.h" -#include "arrow/matlab/buffer/matlab_buffer.h" - -#include "arrow/matlab/type/time_unit.h" -#include "arrow/util/utf8.h" -#include "arrow/type.h" - -namespace arrow::matlab::array::proxy { - - TimestampArray::TimestampArray(std::shared_ptr array) - : arrow::matlab::array::proxy::Array{std::move(array)} {} - - libmexclass::proxy::MakeResult TimestampArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { - namespace mda = ::matlab::data; - using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer; - using TimestampArray = arrow::TimestampArray; - using TimestampArrayProxy = arrow::matlab::array::proxy::TimestampArray; - - mda::StructArray opts = constructor_arguments[0]; - - // Get the mxArray from constructor arguments - const mda::TypedArray timestamp_mda = opts[0]["MatlabArray"]; - const mda::TypedArray validity_bitmap_mda = opts[0]["Valid"]; - - const mda::TypedArray timezone_mda = opts[0]["TimeZone"]; - const mda::TypedArray units_mda = opts[0]["TimeUnit"]; - - // extract the time zone string - const std::u16string& u16_timezone = timezone_mda[0]; - MATLAB_ASSIGN_OR_ERROR(const auto timezone, - arrow::util::UTF16StringToUTF8(u16_timezone), - error::UNICODE_CONVERSION_ERROR_ID); - - // extract the time unit - const std::u16string& u16_timeunit = units_mda[0]; - MATLAB_ASSIGN_OR_ERROR(const auto time_unit, - arrow::matlab::type::timeUnitFromString(u16_timeunit), - error::UKNOWN_TIME_UNIT_ERROR_ID) - - // create the timestamp_type - auto data_type = arrow::timestamp(time_unit, timezone); - auto array_length = static_cast(timestamp_mda.getNumberOfElements()); // cast size_t to int64_t - - auto data_buffer = std::make_shared(timestamp_mda); - - // Pack the validity bitmap values. - MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap, - bit::packValid(validity_bitmap_mda), - error::BITPACK_VALIDITY_BITMAP_ERROR_ID); - - auto array_data = arrow::ArrayData::Make(data_type, array_length, {packed_validity_bitmap, data_buffer}); - auto timestamp_array = std::static_pointer_cast(arrow::MakeArray(array_data)); - return std::make_shared(std::move(timestamp_array)); - } - - void TimestampArray::toMATLAB(libmexclass::proxy::method::Context& context) { - namespace mda = ::matlab::data; - - const auto num_elements = static_cast(array->length()); - const auto timestamp_array = std::static_pointer_cast(array); - const int64_t* const data_begin = timestamp_array->raw_values(); - const int64_t* const data_end = data_begin + num_elements; - - mda::ArrayFactory factory; - - // Constructs a TypedArray from the raw values. Makes a copy. - mda::TypedArray result = factory.createArray({num_elements, 1}, data_begin, data_end); - context.outputs[0] = result; - } - - std::shared_ptr TimestampArray::typeProxy() { - using TimestampProxyType = type::proxy::TimestampType; - auto type = std::static_pointer_cast(array->type()); - return std::make_shared(std::move(type)); - - } -} diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h deleted file mode 100644 index a312a129a21c2..0000000000000 --- a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/array.h" - -#include "arrow/matlab/array/proxy/array.h" - -#include "libmexclass/proxy/Proxy.h" - -#include "arrow/type_fwd.h" - -namespace arrow::matlab::array::proxy { - -class TimestampArray : public arrow::matlab::array::proxy::Array { - public: - TimestampArray(std::shared_ptr array); - - static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); - - protected: - void toMATLAB(libmexclass::proxy::method::Context& context) override; - - std::shared_ptr typeProxy() override; - -}; - -} diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 0f7751035a052..2fb3207e590c6 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -18,7 +18,6 @@ #include "arrow/matlab/array/proxy/boolean_array.h" #include "arrow/matlab/array/proxy/numeric_array.h" #include "arrow/matlab/array/proxy/string_array.h" -#include "arrow/matlab/array/proxy/timestamp_array.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/type/proxy/primitive_ctype.h" @@ -30,19 +29,19 @@ namespace arrow::matlab::proxy { libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, const FunctionArguments& constructor_arguments) { - REGISTER_PROXY(arrow.array.proxy.Float32Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.Float64Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.UInt8Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.UInt16Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.UInt32Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.UInt64Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.Int8Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.Int16Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.Int32Array , arrow::matlab::array::proxy::NumericArray); - REGISTER_PROXY(arrow.array.proxy.Int64Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.Float32Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.Float64Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.UInt8Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.UInt16Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.UInt32Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.UInt64Array , 
arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.Int8Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.Int16Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.Int32Array , arrow::matlab::array::proxy::NumericArray); + REGISTER_PROXY(arrow.array.proxy.Int64Array , arrow::matlab::array::proxy::NumericArray); REGISTER_PROXY(arrow.array.proxy.BooleanArray , arrow::matlab::array::proxy::BooleanArray); REGISTER_PROXY(arrow.array.proxy.StringArray , arrow::matlab::array::proxy::StringArray); - REGISTER_PROXY(arrow.array.proxy.TimestampArray, arrow::matlab::array::proxy::TimestampArray); + REGISTER_PROXY(arrow.array.proxy.TimestampArray, arrow::matlab::array::proxy::NumericArray); REGISTER_PROXY(arrow.tabular.proxy.RecordBatch , arrow::matlab::tabular::proxy::RecordBatch); REGISTER_PROXY(arrow.type.proxy.Float32Type , arrow::matlab::type::proxy::PrimitiveCType); REGISTER_PROXY(arrow.type.proxy.Float64Type , arrow::matlab::type::proxy::PrimitiveCType); diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/traits.h b/matlab/src/cpp/arrow/matlab/type/proxy/traits.h new file mode 100644 index 0000000000000..3d9a957a5e3dc --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/traits.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/type_fwd.h" + +#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/string_type.h" + +namespace arrow::matlab::type::proxy { + + template + struct Traits; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = PrimitiveCType; + }; + + template <> + struct Traits { + using TypeProxy = StringType; + }; + + template <> + struct Traits { + using TypeProxy = TimestampType; + }; +} diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 253632d221040..c10ce07280fa6 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -44,7 +44,6 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/c set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/boolean_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/string_array.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/pack.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/unpack.cc" From c7741fb4e633222346269e80b35b5df48051b585 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 19 Jul 2023 07:41:07 +0800 Subject: [PATCH 005/749] GH-34588:[C++][Python] Add a MetaFunction for "dictionary_decode" (#35356) **Rationale for this change** This PR is for [Issue-34588](https://github.com/apache/arrow/issues/34588). Discussing with @ westonpace, a MetaFunction for "dictionary_decode" is implemented instead of adding a compute kernel. **What changes are included in this PR?** C++: Meta Function of dictionary_decode. 
Python: Test **Are these changes tested?** One test in tests/test_compute.py * Closes: #34588 Lead-authored-by: Junming Chen Co-authored-by: Alenka Frim Co-authored-by: Weston Pace Signed-off-by: Weston Pace --- cpp/src/arrow/compute/kernels/vector_hash.cc | 37 ++++++++++++++++++++ cpp/src/arrow/compute/registry.cc | 1 + cpp/src/arrow/compute/registry_internal.h | 1 + python/pyarrow/tests/test_compute.py | 11 ++++++ 4 files changed, 50 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 2eab7ae8afaf2..a7bb2d88c291b 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -27,6 +27,7 @@ #include "arrow/array/dict_internal.h" #include "arrow/array/util.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" @@ -762,6 +763,38 @@ const FunctionDoc dictionary_encode_doc( ("Return a dictionary-encoded version of the input array."), {"array"}, "DictionaryEncodeOptions"); +// ---------------------------------------------------------------------- +// This function does not use any hashing utilities +// but is kept in this file to be near dictionary_encode +// Dictionary decode implementation + +const FunctionDoc dictionary_decode_doc{ + "Decodes a DictionaryArray to an Array", + ("Return a plain-encoded version of the array input\n" + "This function does nothing if the input is not a dictionary."), + {"dictionary_array"}}; + +class DictionaryDecodeMetaFunction : public MetaFunction { + public: + DictionaryDecodeMetaFunction() + : MetaFunction("dictionary_decode", Arity::Unary(), dictionary_decode_doc) {} + + Result ExecuteImpl(const std::vector& args, + const FunctionOptions* options, + ExecContext* ctx) const override { + if (args[0].type() == nullptr || args[0].type()->id() != Type::DICTIONARY) { + return args[0]; + } + + if (args[0].is_array() || args[0].is_chunked_array()) { + DictionaryType* dict_type = checked_cast(args[0].type().get()); + CastOptions cast_options = CastOptions::Safe(dict_type->value_type()); + return CallFunction("cast", args, &cast_options, ctx); + } else { + return Status::TypeError("Expected an Array or a Chunked Array"); + } + } +}; } // namespace void RegisterVectorHash(FunctionRegistry* registry) { @@ -819,6 +852,10 @@ void RegisterVectorHash(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(dict_encode))); } +void RegisterDictionaryDecode(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunction(std::make_shared())); +} + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index a4b484a2069ea..7a54f78a03736 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -275,6 +275,7 @@ static std::unique_ptr CreateBuiltInRegistry() { // Register core kernels RegisterScalarCast(registry.get()); + RegisterDictionaryDecode(registry.get()); RegisterVectorHash(registry.get()); RegisterVectorSelection(registry.get()); diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index b4239701d9573..cdc9f804e72f1 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -28,6 +28,7 @@ namespace internal { void RegisterScalarArithmetic(FunctionRegistry* registry); void 
RegisterScalarBoolean(FunctionRegistry* registry); void RegisterScalarCast(FunctionRegistry* registry); +void RegisterDictionaryDecode(FunctionRegistry* registry); void RegisterScalarComparison(FunctionRegistry* registry); void RegisterScalarIfElse(FunctionRegistry* registry); void RegisterScalarNested(FunctionRegistry* registry); diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e47e5d3f3eb3b..98ab84c03900f 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1756,6 +1756,17 @@ def test_logical(): assert pc.invert(a) == pa.array([False, True, True, None]) +def test_dictionary_decode(): + array = pa.array(["a", "a", "b", "c", "b"]) + dictionary_array = array.dictionary_encode() + dictionary_array_decode = pc.dictionary_decode(dictionary_array) + + assert array != dictionary_array + + assert array == dictionary_array_decode + assert array == pc.dictionary_decode(array) + + def test_cast(): arr = pa.array([1, 2, 3, 4], type='int64') options = pc.CastOptions(pa.int8()) From 1640a90e93a5e5abc72910186e649355c3affc99 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 19 Jul 2023 12:49:26 +0800 Subject: [PATCH 006/749] MINOR: [Dev] Add js8544 as collaborator (#36764) ### Rationale for this change Add @ js8544 as github collaborator ### What changes are included in this PR? Add in `.asf.yaml` ### Are these changes tested? no ### Are there any user-facing changes? no Lead-authored-by: mwish Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .asf.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.asf.yaml b/.asf.yaml index 9bd4e0ef42b7f..1e7fcf1e07ece 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -24,6 +24,7 @@ github: - danepitkin - davisusanibar - felipecrv + - js8544 - mapleFU notifications: From 3ce9e1df3203b841c8e7815cc10e055ca97760cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 19 Jul 2023 09:34:04 +0200 Subject: [PATCH 007/749] GH-36756: [CI][Python] Install Cython < 3.0 on verify-release-candidate script (#36757) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Some of our verification tasks fail for 13.0.0 ### What changes are included in this PR? Pin Cython to be less than 3.0 ### Are these changes tested? Archery ### Are there any user-facing changes? No * Closes: #36756 Authored-by: Raúl Cumplido Signed-off-by: Joris Van den Bossche --- dev/release/verify-release-candidate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 8c5de9bda85aa..ce31b497c1fab 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -665,7 +665,7 @@ test_python() { show_header "Build and test Python libraries" # Build and test Python - maybe_setup_virtualenv cython numpy setuptools_scm setuptools || exit 1 + maybe_setup_virtualenv "cython<3" numpy setuptools_scm setuptools || exit 1 maybe_setup_conda --file ci/conda_env_python.txt || exit 1 if [ "${USE_CONDA}" -gt 0 ]; then From c3c20ce221a0233af8305a4eaedb75694ea40d56 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 19 Jul 2023 17:25:51 +0900 Subject: [PATCH 008/749] GH-36663: [C++] Fix the default value information for enum options (#36684) ### Rationale for this change The default for `ARROW_SIMD_LEVEL` is described as `NONE` but it's actually `DEFAULT`. 
The default for `ARROW_RUNTIME_SIMD_LEVEL` is described as `NONE` but it's actually `MAX`.

### What changes are included in this PR?

Reorder the possible values so that the default value is the first element.

Before:

-- ARROW_SIMD_LEVEL=SSE4_2 [default=NONE|SSE4_2|AVX2|AVX512|NEON|SVE|SVE128|SVE256|SVE512|DEFAULT]
-- ARROW_RUNTIME_SIMD_LEVEL=MAX [default=NONE|SSE4_2|AVX2|AVX512|MAX]

After:

-- ARROW_SIMD_LEVEL=SSE4_2 [default=DEFAULT|NONE|SSE4_2|AVX2|AVX512|NEON|SVE|SVE128|SVE256|SVE512]
-- ARROW_RUNTIME_SIMD_LEVEL=MAX [default=MAX|NONE|SSE4_2|AVX2|AVX512]

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* Closes: #36663

Authored-by: Sutou Kouhei
Signed-off-by: Antoine Pitrou
---
 cpp/cmake_modules/DefineOptions.cmake | 30 +++++++--------------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index d20af060453b0..f32bb2bcf7290 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -31,24 +31,6 @@ function(check_description_length name description)
   endforeach()
 endfunction()

-function(list_join lst glue out)
-  if("${${lst}}" STREQUAL "")
-    set(${out}
-        ""
-        PARENT_SCOPE)
-    return()
-  endif()
-
-  list(GET ${lst} 0 joined)
-  list(REMOVE_AT ${lst} 0)
-  foreach(item ${${lst}})
-    set(joined "${joined}${glue}${item}")
-  endforeach()
-  set(${out}
-      ${joined}
-      PARENT_SCOPE)
-endfunction()
-
 macro(define_option name description default)
   set(options)
   set(one_value_args)
@@ -63,7 +45,7 @@ macro(define_option name description default)
   endif()

   check_description_length(${name} ${description})
-  list_join(description "\n" multiline_description)
+  list(JOIN description "\n" multiline_description)

   option(${name} "${multiline_description}" ${default})
@@ -76,7 +58,7 @@ endmacro()

 macro(define_option_string name description default)
   check_description_length(${name} ${description})
-  list_join(description "\n" multiline_description)
+  list(JOIN description "\n" multiline_description)

   set(${name}
       ${default}
@@ -87,8 +69,12 @@ macro(define_option_string name description default)
   set("${name}_OPTION_DEFAULT" "\"${default}\"")
   set("${name}_OPTION_TYPE" "string")
   set("${name}_OPTION_POSSIBLE_VALUES" ${ARGN})
-
-  list_join("${name}_OPTION_POSSIBLE_VALUES" "|" "${name}_OPTION_ENUM")
+  list(FIND ${name}_OPTION_POSSIBLE_VALUES "${default}" default_value_index)
+  if(NOT ${default_value_index} EQUAL -1)
+    list(REMOVE_AT ${name}_OPTION_POSSIBLE_VALUES ${default_value_index})
+    list(PREPEND ${name}_OPTION_POSSIBLE_VALUES "${default}")
+  endif()
+  list(JOIN "${name}_OPTION_POSSIBLE_VALUES" "|" "${name}_OPTION_ENUM")
   if(NOT ("${${name}_OPTION_ENUM}" STREQUAL ""))
     set_property(CACHE ${name} PROPERTY STRINGS "${name}_OPTION_POSSIBLE_VALUES")
   endif()

From 7ad300390e8a193f242a7776b941a1d5fc160c06 Mon Sep 17 00:00:00 2001
From: Jinpeng
Date: Wed, 19 Jul 2023 04:29:27 -0400
Subject: [PATCH 009/749] PARQUET-2323: [C++] Use bitmap to store pre-buffered column chunks (#36649)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

In https://issues.apache.org/jira/browse/PARQUET-2316 we allowed partial buffering in the Parquet file reader by storing the indices of pre-buffered column chunks in a hash set, and by making a copy of this hash set for each row group reader.
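To make the proposed bookkeeping concrete, here is a minimal editorial sketch (the helper names are hypothetical; `AllocateEmptyBitmap` and `arrow::bit_util` are the actual utilities this patch relies on) of a per-row-group bitmap with one bit per column chunk:

```cpp
#include <memory>
#include <vector>

#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/util/bit_util.h"

// Build a per-row-group bitmap with one bit per column chunk.
arrow::Result<std::shared_ptr<arrow::Buffer>> MarkPrebuffered(
    int num_columns, const std::vector<int>& column_indices) {
  ARROW_ASSIGN_OR_RAISE(auto bitmap, arrow::AllocateEmptyBitmap(num_columns));
  for (int col : column_indices) {
    arrow::bit_util::SetBit(bitmap->mutable_data(), col);
  }
  return bitmap;
}

// Readers share the (immutable) bitmap instead of each copying a hash set.
bool IsPrebuffered(const std::shared_ptr<arrow::Buffer>& bitmap, int col) {
  return bitmap != nullptr && arrow::bit_util::GetBit(bitmap->data(), col);
}
```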
In extreme conditions, where numerous columns are prebuffered and multiple row group readers are created for the same row group, the hash set would incur significant overhead. Using a bitmap instead (with one bit per column chunk indicating whether it's prebuffered or not) would be a reasonable mitigation, taking only 4KB for 32K columns.

### What changes are included in this PR?

Switch from a hash set to a bitmap buffer.

### Are these changes tested?

Yes, passed the unit tests on partial prebuffering.

### Are there any user-facing changes?

No.

Lead-authored-by: jp0317
Co-authored-by: Jinpeng
Co-authored-by: Gang Wu
Signed-off-by: Antoine Pitrou
---
 .../parquet/arrow/arrow_reader_writer_test.cc |  4 +--
 cpp/src/parquet/file_reader.cc                | 33 +++++++++++--------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 69827d5c464b9..8585b1ccf11aa 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2413,9 +2413,9 @@ TEST(TestArrowReadWrite, CoalescedReadsAndNonCoalescedReads) {

   ASSERT_EQ(2, reader->num_row_groups());

-  // Pre-buffer 3 columns in the 2nd row group.
+  // Pre-buffer column 0 and column 3 in the 2nd row group.
   const std::vector<int> row_groups = {1};
-  const std::vector<int> column_indices = {0, 1, 4};
+  const std::vector<int> column_indices = {0, 3};
   reader->parquet_reader()->PreBuffer(row_groups, column_indices,
                                       ::arrow::io::IOContext(),
                                       ::arrow::io::CacheOptions::Defaults());

diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index fc30ddb43f29c..adda9a027bded 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -29,6 +29,7 @@
 #include "arrow/io/caching.h"
 #include "arrow/io/file.h"
 #include "arrow/io/memory.h"
+#include "arrow/util/bit_util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/future.h"
 #include "arrow/util/int_util_overflow.h"
@@ -179,7 +180,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
                      std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source,
                      int64_t source_size, FileMetaData* file_metadata, int row_group_number,
                      const ReaderProperties& props,
-                     std::unordered_set<int> prebuffered_column_chunks,
+                     std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap,
                      std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
       : source_(std::move(source)),
         cached_source_(std::move(cached_source)),
@@ -187,7 +188,7 @@
         file_metadata_(file_metadata),
         properties_(props),
         row_group_ordinal_(row_group_number),
-        prebuffered_column_chunks_(std::move(prebuffered_column_chunks)),
+        prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)),
         file_decryptor_(file_decryptor) {
     row_group_metadata_ = file_metadata->RowGroup(row_group_number);
   }
@@ -203,8 +204,8 @@
     ::arrow::io::ReadRange col_range =
         ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i);
     std::shared_ptr<ArrowInputStream> stream;
-    if (cached_source_ &&
-        prebuffered_column_chunks_.find(i) != prebuffered_column_chunks_.end()) {
+    if (cached_source_ && prebuffered_column_chunks_bitmap_ != nullptr &&
+        ::arrow::bit_util::GetBit(prebuffered_column_chunks_bitmap_->data(), i)) {
       // PARQUET-1698: if read coalescing is enabled, read from pre-buffered
       // segments.
PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range)); @@ -272,7 +273,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int row_group_ordinal_; - const std::unordered_set prebuffered_column_chunks_; + const std::shared_ptr prebuffered_column_chunks_bitmap_; std::shared_ptr file_decryptor_; }; @@ -302,17 +303,17 @@ class SerializedFile : public ParquetFileReader::Contents { } std::shared_ptr GetRowGroup(int i) override { - std::unordered_set prebuffered_column_chunks; - // Avoid updating the map as this function can be called concurrently. The map can - // only be updated within Prebuffer(). + std::shared_ptr prebuffered_column_chunks_bitmap; + // Avoid updating the bitmap as this function can be called concurrently. The bitmap + // can only be updated within Prebuffer(). auto prebuffered_column_chunks_iter = prebuffered_column_chunks_.find(i); if (prebuffered_column_chunks_iter != prebuffered_column_chunks_.end()) { - prebuffered_column_chunks = prebuffered_column_chunks_iter->second; + prebuffered_column_chunks_bitmap = prebuffered_column_chunks_iter->second; } std::unique_ptr contents = std::make_unique( source_, cached_source_, source_size_, file_metadata_.get(), i, properties_, - std::move(prebuffered_column_chunks), file_decryptor_); + std::move(prebuffered_column_chunks_bitmap), file_decryptor_); return std::make_shared(std::move(contents)); } @@ -366,9 +367,12 @@ class SerializedFile : public ParquetFileReader::Contents { std::vector<::arrow::io::ReadRange> ranges; prebuffered_column_chunks_.clear(); for (int row : row_groups) { - std::unordered_set& prebuffered = prebuffered_column_chunks_[row]; + std::shared_ptr& col_bitmap = prebuffered_column_chunks_[row]; + int num_cols = file_metadata_->num_columns(); + PARQUET_THROW_NOT_OK( + AllocateEmptyBitmap(num_cols, properties_.memory_pool()).Value(&col_bitmap)); for (int col : column_indices) { - prebuffered.insert(col); + ::arrow::bit_util::SetBit(col_bitmap->mutable_data(), col); ranges.push_back( ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); } @@ -578,8 +582,9 @@ class SerializedFile : public ParquetFileReader::Contents { ReaderProperties properties_; std::shared_ptr page_index_reader_; std::unique_ptr bloom_filter_reader_; - // Maps a row group to its column chunks that are cached via Prebuffer(). - std::unordered_map> prebuffered_column_chunks_; + // Maps row group ordinal and prebuffer status of its column chunks in the form of a + // bitmap buffer. + std::unordered_map> prebuffered_column_chunks_; std::shared_ptr file_decryptor_; // \return The true length of the metadata in bytes From a7958d9a68ed1fc502199bcccf948b34110b8e33 Mon Sep 17 00:00:00 2001 From: Ben Harkins <60872452+benibus@users.noreply.github.com> Date: Wed, 19 Jul 2023 04:45:54 -0400 Subject: [PATCH 010/749] GH-36326: [C++] Remove APIs deprecated in v9.0 or earlier (#36675) ### Rationale for this change General cleanup of C++ APIs deprecated for at least a year. ### What changes are included in this PR? Removes any APIs annotated with `ARROW_DEPRECATED` and an included version `<=9.0`. In a few cases, tests needed to be removed or slightly altered. ### Are these changes tested? Yes (covered by existing tests) when applicable. ### Are there any user-facing changes? 
Yes * Closes: #36326 Authored-by: benibus Signed-off-by: Gang Wu --- cpp/src/arrow/array/array_test.cc | 19 +--- cpp/src/arrow/array/builder_adaptive.h | 2 - cpp/src/arrow/array/builder_base.cc | 8 -- cpp/src/arrow/array/builder_base.h | 9 -- cpp/src/arrow/compute/api_scalar.cc | 28 ------ cpp/src/arrow/compute/api_scalar.h | 18 ---- cpp/src/arrow/compute/api_vector.cc | 7 -- cpp/src/arrow/compute/api_vector.h | 8 -- cpp/src/arrow/csv/reader.cc | 21 ----- cpp/src/arrow/csv/reader.h | 13 --- cpp/src/arrow/flight/client.cc | 78 ---------------- cpp/src/arrow/flight/client.h | 93 ------------------- cpp/src/arrow/flight/flight_internals_test.cc | 21 ----- cpp/src/arrow/flight/server.h | 8 -- cpp/src/arrow/flight/types.cc | 78 ---------------- cpp/src/arrow/flight/types.h | 60 ------------ cpp/src/arrow/io/interfaces.h | 4 - cpp/src/arrow/record_batch.cc | 8 -- cpp/src/arrow/record_batch.h | 6 -- cpp/src/arrow/record_batch_test.cc | 28 ------ cpp/src/arrow/table_builder.cc | 24 ----- cpp/src/arrow/table_builder.h | 31 ------- cpp/src/parquet/arrow/reader.cc | 10 -- cpp/src/parquet/arrow/reader.h | 27 ++---- 24 files changed, 11 insertions(+), 598 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 555e40b7b308a..0b82a82fbdb26 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -1727,21 +1727,6 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesStdBool) { this->Check(this->builder_nn_, false); } -TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { - ARROW_SUPPRESS_DEPRECATION_WARNING - int64_t n = 1000; - ASSERT_OK(this->builder_->Reserve(n)); - - ASSERT_OK(this->builder_->Advance(100)); - ASSERT_EQ(100, this->builder_->length()); - - ASSERT_OK(this->builder_->Advance(900)); - - int64_t too_many = this->builder_->capacity() - 1000 + 1; - ASSERT_RAISES(Invalid, this->builder_->Advance(too_many)); - ARROW_UNSUPPRESS_DEPRECATION_WARNING -} - TYPED_TEST(TestPrimitiveBuilder, TestResize) { int64_t cap = kMinBuilderCapacity * 2; @@ -1757,9 +1742,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_OK(this->builder_->Reserve(100)); ASSERT_EQ(0, this->builder_->length()); ASSERT_GE(100, this->builder_->capacity()); - ARROW_SUPPRESS_DEPRECATION_WARNING - ASSERT_OK(this->builder_->Advance(100)); - ARROW_UNSUPPRESS_DEPRECATION_WARNING + ASSERT_OK(this->builder_->AppendEmptyValues(100)); ASSERT_EQ(100, this->builder_->length()); ASSERT_GE(100, this->builder_->capacity()); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 382c35789c4e0..0cea571be3e32 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -142,7 +142,6 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool()) : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {} - using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; /// Scalar append @@ -182,7 +181,6 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase int64_t alignment = kDefaultBufferAlignment) : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {} - using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; /// Scalar append diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 70da1fbb2966a..3000aea3e189a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -91,14 
+91,6 @@ Status ArrayBuilder::Resize(int64_t capacity) { return null_bitmap_builder_.Resize(capacity); } -Status ArrayBuilder::Advance(int64_t elements) { - if (length_ + elements > capacity_) { - return Status::Invalid("Builder must be expanded"); - } - length_ += elements; - return null_bitmap_builder_.Advance(elements); -} - namespace { template diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index abbd61be80359..05af850fd149c 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -180,15 +180,6 @@ class ARROW_EXPORT ArrayBuilder { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } - /// For cases where raw data was memcpy'd into the internal buffers, allows us - /// to advance the length of the builder. It is your responsibility to use - /// this function responsibly. - ARROW_DEPRECATED( - "Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly " - "untested.\nFor low-level control over buffer construction, use BufferBuilder " - "or TypedBufferBuilder directly.") - Status Advance(int64_t elements); - /// \brief Return result of builder as an internal generic ArrayData /// object. Resets builder except for dictionary builder /// diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index ae7e82fb2f9e4..d7a61d0a55985 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -775,34 +775,6 @@ SCALAR_EAGER_BINARY(Or, "or") SCALAR_EAGER_BINARY(Xor, "xor") SCALAR_EAGER_UNARY(Invert, "invert") -// ---------------------------------------------------------------------- - -Result Compare(const Datum& left, const Datum& right, CompareOptions options, - ExecContext* ctx) { - std::string func_name; - switch (options.op) { - case CompareOperator::EQUAL: - func_name = "equal"; - break; - case CompareOperator::NOT_EQUAL: - func_name = "not_equal"; - break; - case CompareOperator::GREATER: - func_name = "greater"; - break; - case CompareOperator::GREATER_EQUAL: - func_name = "greater_equal"; - break; - case CompareOperator::LESS: - func_name = "less"; - break; - case CompareOperator::LESS_EQUAL: - func_name = "less_equal"; - break; - } - return CallFunction(func_name, {left, right}, nullptr, ctx); -} - // ---------------------------------------------------------------------- // Validity functions diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 10a2b4bffde6d..0a06a2829f0da 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -970,24 +970,6 @@ Result RoundTemporal( const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(), ExecContext* ctx = NULLPTR); -/// \brief Compare a numeric array with a scalar. -/// -/// \param[in] left datum to compare, must be an Array -/// \param[in] right datum to compare, must be a Scalar of the same type than -/// left Datum. -/// \param[in] options compare options -/// \param[in] ctx the function execution context, optional -/// \return resulting datum -/// -/// Note on floating point arrays, this uses ieee-754 compare semantics. -/// -/// \since 1.0.0 -/// \note API not yet finalized -ARROW_DEPRECATED("Deprecated in 5.0.0. 
Use each compare function directly") -ARROW_EXPORT -Result Compare(const Datum& left, const Datum& right, CompareOptions options, - ExecContext* ctx = NULLPTR); - /// \brief Invert the values of a boolean datum /// \param[in] value datum to invert /// \param[in] ctx the function execution context, optional diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 67595c3308f9b..f73b10e11edd7 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -417,12 +417,5 @@ Result CumulativeMin(const Datum& values, const CumulativeOptions& option return CallFunction("cumulative_min", {Datum(values)}, &options, ctx); } -// ---------------------------------------------------------------------- -// Deprecated functions - -Result> SortToIndices(const Array& values, ExecContext* ctx) { - return SortIndices(values, SortOrder::Ascending, ctx); -} - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index c85db1aa3ba88..4f226ac00788a 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -683,13 +683,5 @@ Result> PairwiseDiff(const Array& array, bool check_overflow = false, ExecContext* ctx = NULLPTR); -// ---------------------------------------------------------------------- -// Deprecated functions - -ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()") -ARROW_EXPORT -Result> SortToIndices(const Array& values, - ExecContext* ctx = NULLPTR); - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index fdc7fcb1380e5..bf703b6c6ba28 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -1246,27 +1246,6 @@ Result> TableReader::Make( parse_options, convert_options); } -Result> TableReader::Make( - MemoryPool* pool, io::IOContext io_context, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options) { - return MakeTableReader(pool, io_context, std::move(input), read_options, parse_options, - convert_options); -} - -Result> StreamingReader::Make( - MemoryPool* pool, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options) { - auto io_context = io::IOContext(pool); - auto cpu_executor = arrow::internal::GetCpuThreadPool(); - auto reader_fut = MakeStreamingReader(io_context, std::move(input), cpu_executor, - read_options, parse_options, convert_options); - auto reader_result = reader_fut.result(); - ARROW_ASSIGN_OR_RAISE(auto reader, reader_result); - return reader; -} - Result> StreamingReader::Make( io::IOContext io_context, std::shared_ptr input, const ReadOptions& read_options, const ParseOptions& parse_options, diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h index 03b953d0055e5..bae301dc14815 100644 --- a/cpp/src/arrow/csv/reader.h +++ b/cpp/src/arrow/csv/reader.h @@ -52,13 +52,6 @@ class ARROW_EXPORT TableReader { const ReadOptions&, const ParseOptions&, const ConvertOptions&); - - ARROW_DEPRECATED( - "Deprecated in 4.0.0. 
" - "Use MemoryPool-less variant (the IOContext holds a pool already)") - static Result> Make( - MemoryPool* pool, io::IOContext io_context, std::shared_ptr input, - const ReadOptions&, const ParseOptions&, const ConvertOptions&); }; /// \brief A class that reads a CSV file incrementally @@ -105,12 +98,6 @@ class ARROW_EXPORT StreamingReader : public RecordBatchReader { static Result> Make( io::IOContext io_context, std::shared_ptr input, const ReadOptions&, const ParseOptions&, const ConvertOptions&); - - ARROW_DEPRECATED("Deprecated in 4.0.0. Use IOContext-based overload") - static Result> Make( - MemoryPool* pool, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options); }; /// \brief Count the logical rows of data in a CSV file (i.e. the diff --git a/cpp/src/arrow/flight/client.cc b/cpp/src/arrow/flight/client.cc index e5e9f141aa62b..ec5377b7c11dc 100644 --- a/cpp/src/arrow/flight/client.cc +++ b/cpp/src/arrow/flight/client.cc @@ -79,16 +79,6 @@ arrow::Result> FlightStreamReader::ToTable( return Table::FromRecordBatches(schema, std::move(batches)); } -Status FlightStreamReader::ReadAll(std::vector>* batches, - const StopToken& stop_token) { - return ToRecordBatches(stop_token).Value(batches); -} - -Status FlightStreamReader::ReadAll(std::shared_ptr* table, - const StopToken& stop_token) { - return ToTable(stop_token).Value(table); -} - /// \brief An ipc::MessageReader adapting the Flight ClientDataStream interface. /// /// In order to support app_metadata and reuse the existing IPC @@ -520,11 +510,6 @@ arrow::Result> FlightClient::Connect( return Connect(location, FlightClientOptions::Defaults()); } -Status FlightClient::Connect(const Location& location, - std::unique_ptr* client) { - return Connect(location, FlightClientOptions::Defaults()).Value(client); -} - arrow::Result> FlightClient::Connect( const Location& location, const FlightClientOptions& options) { flight::transport::grpc::InitializeFlightGrpcClient(); @@ -538,11 +523,6 @@ arrow::Result> FlightClient::Connect( return client; } -Status FlightClient::Connect(const Location& location, const FlightClientOptions& options, - std::unique_ptr* client) { - return Connect(location, options).Value(client); -} - Status FlightClient::Authenticate(const FlightCallOptions& options, std::unique_ptr auth_handler) { RETURN_NOT_OK(CheckOpen()); @@ -564,11 +544,6 @@ arrow::Result> FlightClient::DoAction( return results; } -Status FlightClient::DoAction(const FlightCallOptions& options, const Action& action, - std::unique_ptr* results) { - return DoAction(options, action).Value(results); -} - arrow::Result FlightClient::CancelFlightInfo( const FlightCallOptions& options, const CancelFlightInfoRequest& request) { ARROW_ASSIGN_OR_RAISE(auto body, request.SerializeToString()); @@ -601,11 +576,6 @@ arrow::Result> FlightClient::ListActions( return actions; } -Status FlightClient::ListActions(const FlightCallOptions& options, - std::vector* actions) { - return ListActions(options).Value(actions); -} - arrow::Result> FlightClient::GetFlightInfo( const FlightCallOptions& options, const FlightDescriptor& descriptor) { std::unique_ptr info; @@ -614,32 +584,16 @@ arrow::Result> FlightClient::GetFlightInfo( return info; } -Status FlightClient::GetFlightInfo(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* info) { - return GetFlightInfo(options, descriptor).Value(info); -} - arrow::Result> FlightClient::GetSchema( const FlightCallOptions& 
options, const FlightDescriptor& descriptor) { RETURN_NOT_OK(CheckOpen()); return transport_->GetSchema(options, descriptor); } -Status FlightClient::GetSchema(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* schema_result) { - return GetSchema(options, descriptor).Value(schema_result); -} - arrow::Result> FlightClient::ListFlights() { return ListFlights({}, {}); } -Status FlightClient::ListFlights(std::unique_ptr* listing) { - return ListFlights({}, {}).Value(listing); -} - arrow::Result> FlightClient::ListFlights( const FlightCallOptions& options, const Criteria& criteria) { std::unique_ptr listing; @@ -648,12 +602,6 @@ arrow::Result> FlightClient::ListFlights( return listing; } -Status FlightClient::ListFlights(const FlightCallOptions& options, - const Criteria& criteria, - std::unique_ptr* listing) { - return ListFlights(options, criteria).Value(listing); -} - arrow::Result> FlightClient::DoGet( const FlightCallOptions& options, const Ticket& ticket) { RETURN_NOT_OK(CheckOpen()); @@ -668,11 +616,6 @@ arrow::Result> FlightClient::DoGet( return stream_reader; } -Status FlightClient::DoGet(const FlightCallOptions& options, const Ticket& ticket, - std::unique_ptr* stream) { - return DoGet(options, ticket).Value(stream); -} - arrow::Result FlightClient::DoPut( const FlightCallOptions& options, const FlightDescriptor& descriptor, const std::shared_ptr& schema) { @@ -689,17 +632,6 @@ arrow::Result FlightClient::DoPut( return result; } -Status FlightClient::DoPut(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - const std::shared_ptr& schema, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto result, DoPut(options, descriptor, schema)); - *writer = std::move(result.writer); - *reader = std::move(result.reader); - return Status::OK(); -} - arrow::Result FlightClient::DoExchange( const FlightCallOptions& options, const FlightDescriptor& descriptor) { RETURN_NOT_OK(CheckOpen()); @@ -717,16 +649,6 @@ arrow::Result FlightClient::DoExchange( return result; } -Status FlightClient::DoExchange(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto result, DoExchange(options, descriptor)); - *writer = std::move(result.writer); - *reader = std::move(result.reader); - return Status::OK(); -} - Status FlightClient::Close() { if (!closed_) { closed_ = true; diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index ba9f688dce8b7..7204b469a6127 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -139,17 +139,9 @@ class ARROW_FLIGHT_EXPORT FlightStreamReader : public MetadataRecordBatchReader virtual arrow::Result>> ToRecordBatches( const StopToken& stop_token) = 0; - using MetadataRecordBatchReader::ReadAll; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.") - Status ReadAll(std::vector>* batches, - const StopToken& stop_token); - using MetadataRecordBatchReader::ToTable; /// \brief Consume entire stream as a Table arrow::Result> ToTable(const StopToken& stop_token); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.") - Status ReadAll(std::shared_ptr
* table, const StopToken& stop_token); }; // Silence warning @@ -196,9 +188,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// the connection was successful static arrow::Result> Connect(const Location& location); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Connect(const Location& location, std::unique_ptr* client); - /// \brief Connect to an unauthenticated flight service /// \param[in] location the URI /// \param[in] options Other options for setting up the client @@ -207,10 +196,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { static arrow::Result> Connect( const Location& location, const FlightClientOptions& options); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Connect(const Location& location, const FlightClientOptions& options, - std::unique_ptr* client); - /// \brief Authenticate to the server using the given handler. /// \param[in] options Per-RPC options /// \param[in] auth_handler The authentication mechanism to use @@ -239,14 +224,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoAction({}, action); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoAction(const FlightCallOptions& options, const Action& action, - std::unique_ptr* results); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoAction(const Action& action, std::unique_ptr* results) { - return DoAction({}, action).Value(results); - } - /// \brief Perform the CancelFlightInfo action, returning a /// CancelFlightInfoResult /// @@ -281,13 +258,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return ListActions(FlightCallOptions()); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListActions(const FlightCallOptions& options, std::vector* actions); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListActions(std::vector* actions) { - return ListActions().Value(actions); - } - /// \brief Request access plan for a single flight, which may be an existing /// dataset or a command to be executed /// \param[in] options Per-RPC options @@ -301,16 +271,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return GetFlightInfo({}, descriptor); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetFlightInfo(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* info); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetFlightInfo(const FlightDescriptor& descriptor, - std::unique_ptr* info) { - return GetFlightInfo({}, descriptor).Value(info); - } - /// \brief Request schema for a single flight, which may be an existing /// dataset or a command to be executed /// \param[in] options Per-RPC options @@ -320,27 +280,15 @@ class ARROW_FLIGHT_EXPORT FlightClient { arrow::Result> GetSchema( const FlightCallOptions& options, const FlightDescriptor& descriptor); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchema(const FlightCallOptions& options, const FlightDescriptor& descriptor, - std::unique_ptr* schema_result); - arrow::Result> GetSchema( const FlightDescriptor& descriptor) { return GetSchema({}, descriptor); } - ARROW_DEPRECATED("Deprecated in 8.0.0. 
Use Result-returning overload instead.") - Status GetSchema(const FlightDescriptor& descriptor, - std::unique_ptr* schema_result) { - return GetSchema({}, descriptor).Value(schema_result); - } /// \brief List all available flights known to the server /// \return Arrow result with an iterator that returns a FlightInfo for each flight arrow::Result> ListFlights(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListFlights(std::unique_ptr* listing); - /// \brief List available flights given indicated filter criteria /// \param[in] options Per-RPC options /// \param[in] criteria the filter criteria (opaque) @@ -348,10 +296,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { arrow::Result> ListFlights( const FlightCallOptions& options, const Criteria& criteria); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status ListFlights(const FlightCallOptions& options, const Criteria& criteria, - std::unique_ptr* listing); - /// \brief Given a flight ticket and schema, request to be sent the /// stream. Returns record batch stream reader /// \param[in] options Per-RPC options @@ -363,14 +307,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoGet({}, ticket); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoGet(const FlightCallOptions& options, const Ticket& ticket, - std::unique_ptr* stream); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoGet(const Ticket& ticket, std::unique_ptr* stream) { - return DoGet({}, ticket).Value(stream); - } - /// \brief DoPut return value struct DoPutResult { /// \brief a writer to write record batches to @@ -399,21 +335,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoPut({}, descriptor, schema); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoPut(const FlightCallOptions& options, const FlightDescriptor& descriptor, - const std::shared_ptr& schema, - std::unique_ptr* writer, - std::unique_ptr* reader); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoPut(const FlightDescriptor& descriptor, const std::shared_ptr& schema, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto output, DoPut({}, descriptor, schema)); - *writer = std::move(output.writer); - *reader = std::move(output.reader); - return Status::OK(); - } - struct DoExchangeResult { std::unique_ptr writer; std::unique_ptr reader; @@ -424,20 +345,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { return DoExchange({}, descriptor); } - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoExchange(const FlightCallOptions& options, const FlightDescriptor& descriptor, - std::unique_ptr* writer, - std::unique_ptr* reader); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status DoExchange(const FlightDescriptor& descriptor, - std::unique_ptr* writer, - std::unique_ptr* reader) { - ARROW_ASSIGN_OR_RAISE(auto output, DoExchange({}, descriptor)); - *writer = std::move(output.writer); - *reader = std::move(output.reader); - return Status::OK(); - } - /// \brief Explicitly shut down and clean up the client. 
/// /// For backwards compatibility, this will be implicitly called by diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index 87cd1ca887d5f..27c13ff949836 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -381,27 +381,6 @@ TEST(FlightTypes, LocationConstruction) { ASSERT_EQ(location.ToString(), "grpc+unix:///tmp/test.sock"); } -ARROW_SUPPRESS_DEPRECATION_WARNING -TEST(FlightTypes, DeprecatedLocationConstruction) { - Location location; - ASSERT_RAISES(Invalid, Location::Parse("This is not an URI", &location)); - ASSERT_RAISES(Invalid, - Location::ForGrpcTcp("This is not a hostname", 12345, &location)); - ASSERT_RAISES(Invalid, - Location::ForGrpcTls("This is not a hostname", 12345, &location)); - ASSERT_RAISES(Invalid, Location::ForGrpcUnix("This is not a filename", &location)); - - ASSERT_OK(Location::Parse("s3://test", &location)); - ASSERT_EQ(location.ToString(), "s3://test"); - ASSERT_OK(Location::ForGrpcTcp("localhost", 12345, &location)); - ASSERT_EQ(location.ToString(), "grpc+tcp://localhost:12345"); - ASSERT_OK(Location::ForGrpcTls("localhost", 12345, &location)); - ASSERT_EQ(location.ToString(), "grpc+tls://localhost:12345"); - ASSERT_OK(Location::ForGrpcUnix("/tmp/test.sock", &location)); - ASSERT_EQ(location.ToString(), "grpc+unix:///tmp/test.sock"); -} -ARROW_UNSUPPRESS_DEPRECATION_WARNING - // ---------------------------------------------------------------------- // Cookie authentication/middleware diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 6fb8ab1213117..5c15d8d5645d1 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -53,18 +53,10 @@ class ARROW_FLIGHT_EXPORT FlightDataStream { /// \brief Compute FlightPayload containing serialized RecordBatch schema virtual arrow::Result GetSchemaPayload() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchemaPayload(FlightPayload* payload) { - return GetSchemaPayload().Value(payload); - } - // When the stream is completed, the last payload written will have null // metadata virtual arrow::Result Next() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. 
Use Result-returning overload instead.") - Status Next(FlightPayload* payload) { return Next().Value(payload); } - virtual Status Close(); }; diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 44c0c6547cbb1..7c72595ed624b 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -158,11 +158,6 @@ arrow::Result> SchemaResult::Make(const Schema& sc return std::make_unique(std::move(schema_in)); } -Status SchemaResult::GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const { - return GetSchema(dictionary_memo).Value(out); -} - std::string SchemaResult::ToString() const { return ""; } @@ -206,10 +201,6 @@ arrow::Result FlightDescriptor::SerializeToString() const { return out; } -Status FlightDescriptor::SerializeToString(std::string* out) const { - return SerializeToString().Value(out); -} - arrow::Result FlightDescriptor::Deserialize( std::string_view serialized) { pb::FlightDescriptor pb_descriptor; @@ -226,11 +217,6 @@ arrow::Result FlightDescriptor::Deserialize( return out; } -Status FlightDescriptor::Deserialize(const std::string& serialized, - FlightDescriptor* out) { - return Deserialize(serialized).Value(out); -} - std::string Ticket::ToString() const { std::stringstream ss; ss << ""; @@ -250,10 +236,6 @@ arrow::Result Ticket::SerializeToString() const { return out; } -Status Ticket::SerializeToString(std::string* out) const { - return SerializeToString().Value(out); -} - arrow::Result Ticket::Deserialize(std::string_view serialized) { pb::Ticket pb_ticket; if (serialized.size() > static_cast(std::numeric_limits::max())) { @@ -269,10 +251,6 @@ arrow::Result Ticket::Deserialize(std::string_view serialized) { return out; } -Status Ticket::Deserialize(const std::string& serialized, Ticket* out) { - return Deserialize(serialized).Value(out); -} - arrow::Result FlightInfo::Make(const Schema& schema, const FlightDescriptor& descriptor, const std::vector& endpoints, @@ -299,11 +277,6 @@ arrow::Result> FlightInfo::GetSchema( return schema_; } -Status FlightInfo::GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const { - return GetSchema(dictionary_memo).Value(out); -} - arrow::Result FlightInfo::SerializeToString() const { pb::FlightInfo pb_info; RETURN_NOT_OK(internal::ToProto(*this, &pb_info)); @@ -315,10 +288,6 @@ arrow::Result FlightInfo::SerializeToString() const { return out; } -Status FlightInfo::SerializeToString(std::string* out) const { - return SerializeToString().Value(out); -} - arrow::Result> FlightInfo::Deserialize( std::string_view serialized) { pb::FlightInfo pb_info; @@ -335,11 +304,6 @@ arrow::Result> FlightInfo::Deserialize( return std::make_unique(std::move(data)); } -Status FlightInfo::Deserialize(const std::string& serialized, - std::unique_ptr* out) { - return Deserialize(serialized).Value(out); -} - std::string FlightInfo::ToString() const { std::stringstream ss; ss << " Location::ForGrpcTls(const std::string& host, const int port) { std::stringstream uri_string; uri_string << "grpc+tls://" << host << ':' << port; return Location::Parse(uri_string.str()); } -Status Location::ForGrpcTls(const std::string& host, const int port, Location* location) { - return ForGrpcTls(host, port).Value(location); -} - arrow::Result Location::ForGrpcUnix(const std::string& path) { std::stringstream uri_string; uri_string << "grpc+unix://" << path; return Location::Parse(uri_string.str()); } -Status Location::ForGrpcUnix(const std::string& path, Location* location) { - return 
ForGrpcUnix(path).Value(location); -} - arrow::Result Location::ForScheme(const std::string& scheme, const std::string& host, const int port) { std::stringstream uri_string; @@ -808,8 +752,6 @@ std::ostream& operator<<(std::ostream& os, CancelStatus status) { return os; } -Status ResultStream::Next(std::unique_ptr* info) { return Next().Value(info); } - Status ResultStream::Drain() { while (true) { ARROW_ASSIGN_OR_RAISE(auto result, Next()); @@ -818,10 +760,6 @@ Status ResultStream::Drain() { return Status::OK(); } -Status MetadataRecordBatchReader::Next(FlightStreamChunk* next) { - return Next().Value(next); -} - arrow::Result>> MetadataRecordBatchReader::ToRecordBatches() { std::vector> batches; @@ -833,21 +771,12 @@ MetadataRecordBatchReader::ToRecordBatches() { return batches; } -Status MetadataRecordBatchReader::ReadAll( - std::vector>* batches) { - return ToRecordBatches().Value(batches); -} - arrow::Result> MetadataRecordBatchReader::ToTable() { ARROW_ASSIGN_OR_RAISE(auto batches, ToRecordBatches()); ARROW_ASSIGN_OR_RAISE(auto schema, GetSchema()); return Table::FromRecordBatches(schema, std::move(batches)); } -Status MetadataRecordBatchReader::ReadAll(std::shared_ptr
* table) { - return ToTable().Value(table); -} - Status MetadataRecordBatchWriter::Begin(const std::shared_ptr& schema) { return Begin(schema, ipc::IpcWriteOptions::Defaults()); } @@ -934,10 +863,6 @@ arrow::Result BasicAuth::Deserialize(std::string_view serialized) { return out; } -Status BasicAuth::Deserialize(const std::string& serialized, BasicAuth* out) { - return Deserialize(serialized).Value(out); -} - arrow::Result BasicAuth::SerializeToString() const { pb::BasicAuth pb_result; RETURN_NOT_OK(internal::ToProto(*this, &pb_result)); @@ -948,8 +873,5 @@ arrow::Result BasicAuth::SerializeToString() const { return out; } -Status BasicAuth::Serialize(const BasicAuth& basic_auth, std::string* out) { - return basic_auth.SerializeToString().Value(out); -} } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 3cca774314017..ca86c27e86976 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -311,12 +311,6 @@ struct ARROW_FLIGHT_EXPORT BasicAuth { static arrow::Result Deserialize(std::string_view serialized); /// \brief Serialize this message to its wire-format representation. arrow::Result SerializeToString() const; - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, BasicAuth* out); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Serialize(const BasicAuth& basic_auth, std::string* out); }; /// \brief A request to retrieve or generate a dataset @@ -349,18 +343,12 @@ struct ARROW_FLIGHT_EXPORT FlightDescriptor { /// services) that may want to return Flight types. arrow::Result SerializeToString() const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status SerializeToString(std::string* out) const; - /// \brief Parse the wire-format representation of this type. /// /// Useful when interoperating with non-Flight systems (e.g. REST /// services) that may want to return Flight types. static arrow::Result Deserialize(std::string_view serialized); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, FlightDescriptor* out); - // Convenience factory functions static FlightDescriptor Command(const std::string& c) { @@ -400,17 +388,11 @@ struct ARROW_FLIGHT_EXPORT Ticket { /// services) that may want to return Flight types. arrow::Result SerializeToString() const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status SerializeToString(std::string* out) const; - /// \brief Parse the wire-format representation of this type. /// /// Useful when interoperating with non-Flight systems (e.g. REST /// services) that may want to return Flight types. static arrow::Result Deserialize(std::string_view serialized); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, Ticket* out); }; class FlightClient; @@ -434,9 +416,6 @@ struct ARROW_FLIGHT_EXPORT Location { /// \brief Initialize a location by parsing a URI string static arrow::Result Parse(const std::string& uri_string); - ARROW_DEPRECATED("Deprecated in 8.0.0. 
Use Result-returning overload instead.") - static Status Parse(const std::string& uri_string, Location* location); - /// \brief Initialize a location for a non-TLS, gRPC-based Flight /// service from a host and port /// \param[in] host The hostname to connect to @@ -444,9 +423,6 @@ struct ARROW_FLIGHT_EXPORT Location { /// \return Arrow result with the resulting location static arrow::Result ForGrpcTcp(const std::string& host, const int port); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status ForGrpcTcp(const std::string& host, const int port, Location* location); - /// \brief Initialize a location for a TLS-enabled, gRPC-based Flight /// service from a host and port /// \param[in] host The hostname to connect to @@ -454,18 +430,12 @@ struct ARROW_FLIGHT_EXPORT Location { /// \return Arrow result with the resulting location static arrow::Result ForGrpcTls(const std::string& host, const int port); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status ForGrpcTls(const std::string& host, const int port, Location* location); - /// \brief Initialize a location for a domain socket-based Flight /// service /// \param[in] path The path to the domain socket /// \return Arrow result with the resulting location static arrow::Result ForGrpcUnix(const std::string& path); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status ForGrpcUnix(const std::string& path, Location* location); - /// \brief Initialize a location based on a URI scheme static arrow::Result ForScheme(const std::string& scheme, const std::string& host, const int port); @@ -576,10 +546,6 @@ struct ARROW_FLIGHT_EXPORT SchemaResult { arrow::Result> GetSchema( ipc::DictionaryMemo* dictionary_memo) const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const; - const std::string& serialized_schema() const { return raw_schema_; } std::string ToString() const; @@ -633,10 +599,6 @@ class ARROW_FLIGHT_EXPORT FlightInfo { arrow::Result> GetSchema( ipc::DictionaryMemo* dictionary_memo) const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const; - const std::string& serialized_schema() const { return data_.schema; } /// The descriptor associated with this flight, may not be set @@ -661,9 +623,6 @@ class ARROW_FLIGHT_EXPORT FlightInfo { /// services) that may want to return Flight types. arrow::Result SerializeToString() const; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status SerializeToString(std::string* out) const; - /// \brief Parse the wire-format representation of this type. /// /// Useful when interoperating with non-Flight systems (e.g. REST @@ -671,10 +630,6 @@ class ARROW_FLIGHT_EXPORT FlightInfo { static arrow::Result> Deserialize( std::string_view serialized); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - static Status Deserialize(const std::string& serialized, - std::unique_ptr* out); - std::string ToString() const; /// Compare two FlightInfo for equality. This will compare the @@ -727,9 +682,6 @@ class ARROW_FLIGHT_EXPORT FlightListing { /// \return Arrow result with a single FlightInfo. Set to \a nullptr if there /// are none left. 
virtual arrow::Result> Next() = 0; - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(std::unique_ptr* info); }; /// \brief An iterator to Result instances returned by DoAction. @@ -741,9 +693,6 @@ class ARROW_FLIGHT_EXPORT ResultStream { /// \return Arrow result with a single Result. Set to \a nullptr if there are none left. virtual arrow::Result> Next() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(std::unique_ptr* info); - /// \brief Read and drop the remaining messages to get the error (if any) from a server. /// \return Status OK if this is no error from a server, any other status if a /// server returns an error. @@ -770,20 +719,11 @@ class ARROW_FLIGHT_EXPORT MetadataRecordBatchReader { /// nullptr. virtual arrow::Result Next() = 0; - ARROW_DEPRECATED("Deprecated in 8.0.0. Use Result-returning overload instead.") - Status Next(FlightStreamChunk* next); - /// \brief Consume entire stream as a vector of record batches virtual arrow::Result>> ToRecordBatches(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.") - Status ReadAll(std::vector>* batches); - /// \brief Consume entire stream as a Table virtual arrow::Result> ToTable(); - - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.") - Status ReadAll(std::shared_ptr
* table); }; /// \brief Convert a MetadataRecordBatchReader to a regular RecordBatchReader. diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index c5355c9422756..dcbe4feb261fb 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -96,10 +96,6 @@ struct ARROW_EXPORT IOContext { StopToken stop_token_; }; -struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext { - using IOContext::IOContext; -}; - class ARROW_EXPORT FileInterface { public: virtual ~FileInterface() = 0; diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 3ce99d6f84a40..683e72878b9b1 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -348,19 +348,11 @@ Result RecordBatchReader::ToRecordBatches() { return batches; } -Status RecordBatchReader::ReadAll(RecordBatchVector* batches) { - return ToRecordBatches().Value(batches); -} - Result> RecordBatchReader::ToTable() { ARROW_ASSIGN_OR_RAISE(auto batches, ToRecordBatches()); return Table::FromRecordBatches(schema(), std::move(batches)); } -Status RecordBatchReader::ReadAll(std::shared_ptr
* table) { - return ToTable().Value(table); -} - class SimpleRecordBatchReader : public RecordBatchReader { public: SimpleRecordBatchReader(Iterator> it, diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 8f9b5882d93ac..d728d5eb0da2f 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -326,15 +326,9 @@ class ARROW_EXPORT RecordBatchReader { /// \brief Consume entire stream as a vector of record batches Result ToRecordBatches(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.") - Status ReadAll(RecordBatchVector* batches); - /// \brief Read all batches and concatenate as arrow::Table Result> ToTable(); - ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.") - Status ReadAll(std::shared_ptr
* table); - /// \brief Create a RecordBatchReader from a vector of RecordBatch. /// /// \param[in] batches the vector of RecordBatch to read from diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index def6dbc54edcb..4975e94325d32 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -496,32 +496,4 @@ TEST_F(TestRecordBatchReader, ToTable) { ASSERT_EQ(table->column(0)->chunks().size(), 0); } -ARROW_SUPPRESS_DEPRECATION_WARNING -TEST_F(TestRecordBatchReader, DeprecatedReadAllToRecordBatches) { - RecordBatchVector batches; - ASSERT_OK(reader_->ReadAll(&batches)); - ASSERT_EQ(batches.size(), batches_.size()); - for (size_t index = 0; index < batches.size(); index++) { - AssertBatchesEqual(*batches[index], *batches_[index]); - } - - ASSERT_OK(reader_->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 0); -} - -TEST_F(TestRecordBatchReader, DeprecatedReadAllToTable) { - std::shared_ptr
table; - - ASSERT_OK(reader_->ReadAll(&table)); - const auto& chunks = table->column(0)->chunks(); - ASSERT_EQ(chunks.size(), batches_.size()); - for (size_t index = 0; index < batches_.size(); index++) { - AssertArraysEqual(*chunks[index], *batches_[index]->column(0)); - } - - ASSERT_OK(reader_->ReadAll(&table)); - ASSERT_EQ(table->column(0)->chunks().size(), 0); -} -ARROW_UNSUPPRESS_DEPRECATION_WARNING - } // namespace arrow diff --git a/cpp/src/arrow/table_builder.cc b/cpp/src/arrow/table_builder.cc index 414aa263cc7f0..19ca151ac200f 100644 --- a/cpp/src/arrow/table_builder.cc +++ b/cpp/src/arrow/table_builder.cc @@ -36,19 +36,6 @@ RecordBatchBuilder::RecordBatchBuilder(const std::shared_ptr& schema, MemoryPool* pool, int64_t initial_capacity) : schema_(schema), initial_capacity_(initial_capacity), pool_(pool) {} -Status RecordBatchBuilder::Make(const std::shared_ptr& schema, MemoryPool* pool, - std::unique_ptr* builder) { - ARROW_ASSIGN_OR_RAISE(*builder, Make(schema, pool, kMinBuilderCapacity)) - return Status::OK(); -} - -Status RecordBatchBuilder::Make(const std::shared_ptr& schema, MemoryPool* pool, - int64_t initial_capacity, - std::unique_ptr* builder) { - ARROW_ASSIGN_OR_RAISE(*builder, Make(schema, pool, initial_capacity)) - return Status::OK(); -} - Result> RecordBatchBuilder::Make( const std::shared_ptr& schema, MemoryPool* pool) { return Make(schema, pool, kMinBuilderCapacity); @@ -63,17 +50,6 @@ Result> RecordBatchBuilder::Make( return std::move(builder); } -Status RecordBatchBuilder::Flush(bool reset_builders, - std::shared_ptr* batch) { - ARROW_ASSIGN_OR_RAISE(*batch, Flush(reset_builders)); - return Status::OK(); -} - -Status RecordBatchBuilder::Flush(std::shared_ptr* batch) { - ARROW_ASSIGN_OR_RAISE(*batch, Flush(true)); - return Status::OK(); -} - Result> RecordBatchBuilder::Flush(bool reset_builders) { std::vector> fields; fields.resize(this->num_fields()); diff --git a/cpp/src/arrow/table_builder.h b/cpp/src/arrow/table_builder.h index 65ebd86ea416e..671cc4ab97996 100644 --- a/cpp/src/arrow/table_builder.h +++ b/cpp/src/arrow/table_builder.h @@ -38,24 +38,6 @@ class RecordBatch; /// schema class ARROW_EXPORT RecordBatchBuilder { public: - /// \brief Create and initialize a RecordBatchBuilder - /// \param[in] schema The schema for the record batch - /// \param[in] pool A MemoryPool to use for allocations - /// \param[in] builder the created builder instance - ARROW_DEPRECATED("Deprecated in 9.0.0. Use Result-returning variant.") - static Status Make(const std::shared_ptr& schema, MemoryPool* pool, - std::unique_ptr* builder); - - /// \brief Create and initialize a RecordBatchBuilder - /// \param[in] schema The schema for the record batch - /// \param[in] pool A MemoryPool to use for allocations - /// \param[in] initial_capacity The initial capacity for the builders - /// \param[in] builder the created builder instance - ARROW_DEPRECATED("Deprecated in 9.0.0. 
Use Result-returning variant.") - static Status Make(const std::shared_ptr& schema, MemoryPool* pool, - int64_t initial_capacity, - std::unique_ptr* builder); - /// \brief Create and initialize a RecordBatchBuilder /// \param[in] schema The schema for the record batch /// \param[in] pool A MemoryPool to use for allocations @@ -84,19 +66,6 @@ class ARROW_EXPORT RecordBatchBuilder { return internal::checked_cast(raw_field_builders_[i]); } - /// \brief Finish current batch and optionally reset - /// \param[in] reset_builders the resulting RecordBatch - /// \param[out] batch the resulting RecordBatch - /// \return Status - ARROW_DEPRECATED("Deprecated in 9.0.0. Use Result-returning variant.") - Status Flush(bool reset_builders, std::shared_ptr* batch); - - /// \brief Finish current batch and reset - /// \param[out] batch the resulting RecordBatch - /// \return Status - ARROW_DEPRECATED("Deprecated in 9.0.0. Use Result-returning variant.") - Status Flush(std::shared_ptr* batch); - /// \brief Finish current batch and optionally reset /// \param[in] reset_builders the resulting RecordBatch /// \return the resulting RecordBatch diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 855fb5a5a4882..142119b770b8c 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -259,16 +259,6 @@ class FileReaderImpl : public FileReader { reader_->metadata()->key_value_metadata(), out); } - Status ReadSchemaField(int i, std::shared_ptr* out) override { - auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns())); - std::vector row_groups = Iota(reader_->metadata()->num_row_groups()); - - std::unique_ptr reader; - RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader)); - - return ReadColumn(i, row_groups, reader.get(), out); - } - Status ReadColumn(int i, const std::vector& row_groups, ColumnReader* reader, std::shared_ptr* out) { BEGIN_PARQUET_CATCH_EXCEPTIONS diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cbd36176f5e3..6e46ca43f7b18 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -141,26 +141,19 @@ class PARQUET_EXPORT FileReader { /// \brief Read column as a whole into a chunked array. /// - /// The indicated column index is relative to the schema + /// The index i refers the index of the top level schema field, which may + /// be nested or flat - e.g. + /// + /// 0 foo.bar + /// foo.bar.baz + /// foo.qux + /// 1 foo2 + /// 2 foo3 + /// + /// i=0 will read the entire foo struct, i=1 the foo2 primitive column etc virtual ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0; - // NOTE: Experimental API - // Reads a specific top level schema field into an Array - // The index i refers the index of the top level schema field, which may - // be nested or flat - e.g. - // - // 0 foo.bar - // foo.bar.baz - // foo.qux - // 1 foo2 - // 2 foo3 - // - // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ARROW_DEPRECATED("Deprecated in 9.0.0. Use ReadColumn instead.") - virtual ::arrow::Status ReadSchemaField( - int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0; - /// \brief Return a RecordBatchReader of all row groups and columns. 
virtual ::arrow::Status GetRecordBatchReader( std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; From 80f77d1371ef93c946b08c3c2a05d269c8548c74 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Wed, 19 Jul 2023 16:52:53 +0800 Subject: [PATCH 011/749] GH-36762: [Dev] Remove only component labels when an issue is updated (#36763) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change A follow-up of https://github.com/apache/arrow/pull/36723#issuecomment-1640816772. ### What changes are included in this PR? Non-component labels should be kept when the issue is updated. ### Are these changes tested? It is tested on my fork. See https://github.com/js8544/arrow/issues/2 ### Are there any user-facing changes? No. * Closes: #36762 Authored-by: Jin Shang Signed-off-by: Raúl Cumplido --- .github/workflows/issue_bot.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml index 7a62f2149662e..ae344a4c1eba9 100644 --- a/.github/workflows/issue_bot.yml +++ b/.github/workflows/issue_bot.yml @@ -39,6 +39,20 @@ jobs: let split_body = context.payload.issue.body.split('### Component(s)'); if (split_body.length != 2) throw new Error('No components found!'); + let current_labels = await github.rest.issues.listLabelsOnIssue({ + "owner": context.repo.owner, + "repo": context.repo.repo, + "per_page": 100, + "issue_number": context.payload.issue.number, + }); + + let current_label_names = current_labels.data.map(label => label.name); + + // keep non-component labels + let non_component_labels = current_label_names.filter( + label => !label.startsWith("Component: ") + ); + let component_labels = split_body[1] .split(',') .map(component => component.trim()) @@ -61,5 +75,5 @@ jobs: "owner": context.repo.owner, "repo": context.repo.repo, "issue_number": context.payload.issue.number, - "labels": component_labels, + "labels": component_labels.concat(non_component_labels), }); From 366e8083a2bd6d24ad371548699ef936fb7bb468 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Jul 2023 11:19:49 +0200 Subject: [PATCH 012/749] GH-35116: [CI][C++] Enable compile-time AVX2 on some CI platforms (#36662) AVX2 became mainline on Intel and AMD server CPUs around 2015, so it's unlikely to be unavailable on current cloud platforms: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#CPUs_with_AVX2 Enable it at least on one Windows and one Linux CI platform. x86 macOS is a legacy platform, so less interesting to exercise there (and I'm not sure the old CPUs in x86 Macs actually support AVX2). Also, fix the buggy AVX2 activation logic in Acero and avoid force-testing AVX2 on incompatible systems. 
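The diffs below standardize the dispatch sites on a single shape: an `ARROW_HAVE_RUNTIME_AVX2` compile-time guard (the toolchain can build a dedicated AVX2 translation unit) around a runtime check of the CPU hardware flags, with the scalar code picking up whatever the SIMD kernel did not process. The bug being fixed is that these sites were guarded with `ARROW_HAVE_AVX2` (the whole build compiled for AVX2), so the runtime-dispatched kernels were unreachable on default builds. A minimal, self-contained sketch of the pattern, with illustrative names (`kAvx2Flag` stands in for `arrow::internal::CpuInfo::AVX2`; the `ProcessRows*` functions are not Arrow APIs):

```cpp
#include <cstdint>

constexpr int64_t kAvx2Flag = 1LL << 1;  // stand-in for arrow::internal::CpuInfo::AVX2

#if defined(ARROW_HAVE_RUNTIME_AVX2)
// In Arrow this would live in a separate *_avx2.cc translation unit that is
// compiled with -mavx2 even when the rest of the build targets baseline SSE4.2.
// It processes a multiple of its unroll factor and reports how far it got.
int64_t ProcessRows_avx2(const uint32_t* /*rows*/, int64_t num_rows) {
  constexpr int64_t kUnroll = 8;
  return num_rows - (num_rows % kUnroll);
}
#endif

void ProcessRows(int64_t hardware_flags, const uint32_t* rows, int64_t num_rows) {
  int64_t num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
  if (hardware_flags & kAvx2Flag) {
    num_processed = ProcessRows_avx2(rows, num_rows);
  }
#endif
  // Scalar code handles the tail, or all rows when AVX2 is unavailable at
  // compile time or not selected at runtime.
  for (int64_t i = num_processed; i < num_rows; ++i) {
    // ... per-row scalar work on rows[i] ...
  }
}
```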
* Closes: #35116 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/cpp.yml | 14 ++- ci/scripts/cpp_build.sh | 1 + cpp/cmake_modules/SetupCxxFlags.cmake | 49 +++++----- cpp/src/arrow/CMakeLists.txt | 22 ++--- cpp/src/arrow/acero/CMakeLists.txt | 6 +- cpp/src/arrow/acero/bloom_filter.cc | 8 +- cpp/src/arrow/acero/bloom_filter.h | 5 +- cpp/src/arrow/acero/bloom_filter_avx2.cc | 5 +- cpp/src/arrow/acero/bloom_filter_test.cc | 85 ++++++++--------- cpp/src/arrow/acero/swiss_join_avx2.cc | 4 - cpp/src/arrow/acero/swiss_join_internal.h | 2 +- cpp/src/arrow/acero/test_util_internal.cc | 9 ++ cpp/src/arrow/acero/test_util_internal.h | 12 ++- cpp/src/arrow/compute/kernels/CMakeLists.txt | 54 ++++++++--- cpp/src/arrow/compute/key_hash.cc | 6 +- cpp/src/arrow/compute/key_hash.h | 4 +- cpp/src/arrow/compute/key_hash_avx2.cc | 4 - cpp/src/arrow/compute/key_hash_test.cc | 92 ++++++++++--------- cpp/src/arrow/compute/key_map.cc | 4 +- cpp/src/arrow/compute/key_map.h | 2 +- cpp/src/arrow/compute/key_map_avx2.cc | 4 - cpp/src/arrow/compute/row/compare_internal.cc | 8 +- cpp/src/arrow/compute/row/compare_internal.h | 2 +- .../compute/row/compare_internal_avx2.cc | 4 - cpp/src/arrow/compute/row/encode_internal.cc | 10 +- cpp/src/arrow/compute/row/encode_internal.h | 6 +- .../arrow/compute/row/encode_internal_avx2.cc | 4 - cpp/src/arrow/compute/util.cc | 10 +- cpp/src/arrow/compute/util.h | 2 +- cpp/src/arrow/compute/util_avx2.cc | 8 +- cpp/src/arrow/testing/util.cc | 15 +++ cpp/src/arrow/testing/util.h | 6 ++ cpp/src/arrow/util/byte_stream_split.h | 28 +++--- docker-compose.yml | 30 +++--- 34 files changed, 289 insertions(+), 236 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 67435566ce305..63a16c8c114ba 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -67,7 +67,8 @@ jobs: image: conda-cpp llvm: "14" runs-on: ubuntu-latest - title: AMD64 Conda C++ + simd-level: AVX2 + title: AMD64 Conda C++ AVX2 ubuntu: "22.04" - arch: amd64 clang-tools: "14" @@ -85,6 +86,7 @@ jobs: ubuntu: "20.04" env: ARCH: ${{ matrix.arch }} + ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} UBUNTU: ${{ matrix.ubuntu }} @@ -175,6 +177,10 @@ jobs: ARROW_WITH_ZSTD: ON GTest_SOURCE: BUNDLED steps: + - name: CPU Info + run: | + sysctl -a | grep cpu + sysctl -a | grep "hw.optional" - name: Checkout Arrow uses: actions/checkout@v3 with: @@ -220,7 +226,7 @@ jobs: ci/scripts/cpp_test.sh $(pwd) $(pwd)/build windows: - name: AMD64 ${{ matrix.name }} C++17 + name: ${{ matrix.title }} runs-on: ${{ matrix.os }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 @@ -231,7 +237,8 @@ jobs: - windows-2019 include: - os: windows-2019 - name: Windows 2019 + simd-level: AVX2 + title: AMD64 Windows 2019 C++17 AVX2 env: ARROW_BOOST_USE_SHARED: OFF ARROW_BUILD_BENCHMARKS: ON @@ -246,6 +253,7 @@ jobs: ARROW_MIMALLOC: ON ARROW_ORC: ON ARROW_PARQUET: ON + ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF ARROW_WITH_BROTLI: OFF diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index f0f893c419616..e53b3fa460915 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -126,6 +126,7 @@ cmake \ -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ 
  -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-ON} \
  -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 076c2e7450798..6b47fcb717287 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -62,29 +62,32 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
         "${ARROW_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw")
     check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2)
   endif()
-  check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
-  if(MINGW)
-    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
-    message(STATUS "Disable AVX512 support on MINGW for now")
-  else()
-    # Check for AVX512 support in the compiler.
-    set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}")
-    check_cxx_source_compiles("
-      #ifdef _MSC_VER
-      #include <intrin.h>
-      #else
-      #include <immintrin.h>
-      #endif
-
-      int main() {
-        __m512i mask = _mm512_set1_epi32(0x1);
-        char out[32];
-        _mm512_storeu_si512(out, mask);
-        return 0;
-      }"
-      CXX_SUPPORTS_AVX512)
-    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    # Check for AVX extensions on 64-bit systems only, as 32-bit support seems iffy
+    check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
+    if(MINGW)
+      # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
+      message(STATUS "Disable AVX512 support on MINGW for now")
+    else()
+      # Check for AVX512 support in the compiler.
+      set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}")
+      check_cxx_source_compiles("
+        #ifdef _MSC_VER
+        #include <intrin.h>
+        #else
+        #include <immintrin.h>
+        #endif
+
+        int main() {
+          __m512i mask = _mm512_set1_epi32(0x1);
+          char out[32];
+          _mm512_storeu_si512(out, mask);
+          return 0;
+        }"
+        CXX_SUPPORTS_AVX512)
+      set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+    endif()
   endif()
   # Runtime SIMD level it can get from compiler and ARROW_RUNTIME_SIMD_LEVEL
   if(CXX_SUPPORTS_SSE4_2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index fccff6c8cf1a9..a398e790de14b 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -119,7 +119,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME)
                      ${ARG_UNPARSED_ARGUMENTS})
 endfunction()

-macro(append_avx2_src SRC)
+macro(append_runtime_avx2_src SRC)
   if(ARROW_HAVE_RUNTIME_AVX2)
     list(APPEND ARROW_SRCS ${SRC})
     set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -127,7 +127,7 @@ macro(append_avx2_src SRC)
   endif()
 endmacro()

-macro(append_avx512_src SRC)
+macro(append_runtime_avx512_src SRC)
   if(ARROW_HAVE_RUNTIME_AVX512)
     list(APPEND ARROW_SRCS ${SRC})
     set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
@@ -254,8 +254,8 @@ if(ARROW_JEMALLOC)
                               PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
 endif()

-append_avx2_src(util/bpacking_avx2.cc)
-append_avx512_src(util/bpacking_avx512.cc)
+append_runtime_avx2_src(util/bpacking_avx2.cc)
+append_runtime_avx512_src(util/bpacking_avx512.cc)

 if(ARROW_HAVE_NEON)
   list(APPEND ARROW_SRCS util/bpacking_neon.cc)
@@ -425,11 +425,11 @@ list(APPEND
      compute/row/row_internal.cc
      compute/util.cc)

-append_avx2_src(compute/key_hash_avx2.cc)
-append_avx2_src(compute/key_map_avx2.cc)
-append_avx2_src(compute/row/compare_internal_avx2.cc)
-append_avx2_src(compute/row/encode_internal_avx2.cc)
-append_avx2_src(compute/util_avx2.cc)
+append_runtime_avx2_src(compute/key_hash_avx2.cc) +append_runtime_avx2_src(compute/key_map_avx2.cc) +append_runtime_avx2_src(compute/row/compare_internal_avx2.cc) +append_runtime_avx2_src(compute/row/encode_internal_avx2.cc) +append_runtime_avx2_src(compute/util_avx2.cc) if(ARROW_COMPUTE) # Include the remaining kernels @@ -464,8 +464,8 @@ if(ARROW_COMPUTE) compute/kernels/vector_select_k.cc compute/kernels/vector_sort.cc) - append_avx2_src(compute/kernels/aggregate_basic_avx2.cc) - append_avx512_src(compute/kernels/aggregate_basic_avx512.cc) + append_runtime_avx2_src(compute/kernels/aggregate_basic_avx2.cc) + append_runtime_avx512_src(compute/kernels/aggregate_basic_avx512.cc) endif() if(ARROW_FILESYSTEM) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 287884432b9fe..c2c91db58d38a 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -19,7 +19,7 @@ add_custom_target(arrow_acero) arrow_install_all_headers("arrow/acero") -macro(append_acero_avx2_src SRC) +macro(append_acero_runtime_avx2_src SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ARROW_ACERO_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -56,8 +56,8 @@ set(ARROW_ACERO_SRCS union_node.cc util.cc) -append_acero_avx2_src(bloom_filter_avx2.cc) -append_acero_avx2_src(swiss_join_avx2.cc) +append_acero_runtime_avx2_src(bloom_filter_avx2.cc) +append_acero_runtime_avx2_src(swiss_join_avx2.cc) set(ARROW_ACERO_SHARED_LINK_LIBS) set(ARROW_ACERO_STATIC_LINK_LIBS) diff --git a/cpp/src/arrow/acero/bloom_filter.cc b/cpp/src/arrow/acero/bloom_filter.cc index ad5e66ded0613..b9855ee506d27 100644 --- a/cpp/src/arrow/acero/bloom_filter.cc +++ b/cpp/src/arrow/acero/bloom_filter.cc @@ -123,7 +123,7 @@ void BlockedBloomFilter::InsertImp(int64_t num_rows, const T* hashes) { void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes) { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = Insert_avx2(num_rows, hashes); } @@ -134,7 +134,7 @@ void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes) { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = Insert_avx2(num_rows, hashes); } @@ -181,7 +181,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, int64_t num_rows, bool enable_prefetch) const { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (!(enable_prefetch && UsePrefetch()) && (hardware_flags & arrow::internal::CpuInfo::AVX2)) { num_processed = Find_avx2(num_rows, hashes, result_bit_vector); @@ -202,7 +202,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, int64_t num_rows, bool enable_prefetch) const { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (!(enable_prefetch && UsePrefetch()) && (hardware_flags & arrow::internal::CpuInfo::AVX2)) { num_processed = Find_avx2(num_rows, hashes, result_bit_vector); diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h index b8f7f8cd256b1..50d07bfd948e0 100644 --- a/cpp/src/arrow/acero/bloom_filter.h +++ b/cpp/src/arrow/acero/bloom_filter.h @@ -17,13 +17,14 @@ #pragma once -#if 
defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) #include #endif #include #include #include + #include "arrow/acero/partition_util.h" #include "arrow/acero/util.h" #include "arrow/memory_pool.h" @@ -203,7 +204,7 @@ class ARROW_ACERO_EXPORT BlockedBloomFilter { void SingleFold(int num_folds); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) inline __m256i mask_avx2(__m256i hash) const; inline __m256i block_id_avx2(__m256i hash) const; int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes); diff --git a/cpp/src/arrow/acero/bloom_filter_avx2.cc b/cpp/src/arrow/acero/bloom_filter_avx2.cc index b6c281276db8d..5816bb4fc0a32 100644 --- a/cpp/src/arrow/acero/bloom_filter_avx2.cc +++ b/cpp/src/arrow/acero/bloom_filter_avx2.cc @@ -16,14 +16,13 @@ // under the License. #include + #include "arrow/acero/bloom_filter.h" #include "arrow/util/bit_util.h" namespace arrow { namespace acero { -#if defined(ARROW_HAVE_AVX2) - inline __m256i BlockedBloomFilter::mask_avx2(__m256i hash) const { // AVX2 translation of mask() method // @@ -132,7 +131,5 @@ int64_t BlockedBloomFilter::Insert_avx2(int64_t num_rows, const uint64_t* hashes return InsertImp_avx2(num_rows, hashes); } -#endif - } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index de433ac68c11a..95375e277e2b8 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -22,13 +22,13 @@ #include #include #include + #include "arrow/acero/bloom_filter.h" #include "arrow/acero/task_util.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/compute/key_hash.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/cpu_info.h" namespace arrow { @@ -171,9 +171,7 @@ void TestBloomSmallHashHelper(int64_t num_input_hashes, const T* input_hashes, // Output FPR and build and probe cost. // void TestBloomSmall(BloomFilterBuildStrategy strategy, int64_t num_build, - int num_build_copies, bool use_simd, bool enable_prefetch) { - int64_t hardware_flags = use_simd ? ::arrow::internal::CpuInfo::AVX2 : 0; - + int num_build_copies, int64_t hardware_flags, bool enable_prefetch) { // Generate input keys // int64_t num_probe = 4 * num_build; @@ -324,10 +322,8 @@ void TestBloomLargeHashHelper(int64_t hardware_flags, int64_t block, // Test with larger size Bloom filters (use large prime with arithmetic // sequence modulo 2^64). // -void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, bool use_simd, - bool enable_prefetch) { - int64_t hardware_flags = use_simd ? 
::arrow::internal::CpuInfo::AVX2 : 0; - +void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, + int64_t hardware_flags, bool enable_prefetch) { // Largest 63-bit prime constexpr uint64_t prime = 0x7FFFFFFFFFFFFFE7ULL; @@ -458,42 +454,40 @@ TEST(BloomFilter, Basic) { num_build.push_back(1LL << log_large); #endif - constexpr int num_param_sets = 3; - struct { - bool use_avx2; + struct TestParam { + int64_t hardware_flags; bool enable_prefetch; bool insert_multiple_copies; - } params[num_param_sets]; - for (int i = 0; i < num_param_sets; ++i) { - params[i].use_avx2 = (i == 1); - params[i].enable_prefetch = (i == 2); - params[i].insert_multiple_copies = (i == 3); + }; + std::vector test_params; + for (const auto hardware_flags : HardwareFlagsForTesting()) { + test_params.push_back({hardware_flags, false, false}); } + test_params.push_back({0, true, false}); + test_params.push_back({0, false, true}); - std::vector strategy; - strategy.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); + std::vector strategies; + strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); #ifndef ARROW_VALGRIND - strategy.push_back(BloomFilterBuildStrategy::PARALLEL); + strategies.push_back(BloomFilterBuildStrategy::PARALLEL); #endif static constexpr int64_t min_rows_for_large = 2 * 1024 * 1024; - for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) { - for (int iparam_set = 0; iparam_set < num_param_sets; ++iparam_set) { - ARROW_SCOPED_TRACE("%s ", params[iparam_set].use_avx2 ? "AVX2" - : params[iparam_set].enable_prefetch ? "PREFETCH" - : params[iparam_set].insert_multiple_copies ? "FOLDING" - : "REGULAR"); - for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) { - ARROW_SCOPED_TRACE("num_build ", static_cast(num_build[inum_build])); - if (num_build[inum_build] >= min_rows_for_large) { - TestBloomLarge(strategy[istrategy], num_build[inum_build], - params[iparam_set].use_avx2, params[iparam_set].enable_prefetch); + for (const auto& strategy : strategies) { + for (const auto& test_param : test_params) { + ARROW_SCOPED_TRACE("hardware_flags = ", test_param.hardware_flags, + test_param.enable_prefetch ? " PREFETCH" : "", + test_param.insert_multiple_copies ? " FOLDING" : "REGULAR"); + for (const auto n : num_build) { + ARROW_SCOPED_TRACE("num_build ", n); + if (n >= min_rows_for_large) { + TestBloomLarge(strategy, n, test_param.hardware_flags, + test_param.enable_prefetch); } else { - TestBloomSmall(strategy[istrategy], num_build[inum_build], - params[iparam_set].insert_multiple_copies ? 8 : 1, - params[iparam_set].use_avx2, params[iparam_set].enable_prefetch); + TestBloomSmall(strategy, n, test_param.insert_multiple_copies ? 8 : 1, + test_param.hardware_flags, test_param.enable_prefetch); } } } @@ -506,19 +500,18 @@ TEST(BloomFilter, Scaling) { num_build.push_back(1000000); num_build.push_back(4000000); - std::vector strategy; - strategy.push_back(BloomFilterBuildStrategy::PARALLEL); - - for (bool use_avx2 : {false, true}) { - for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) { - for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) { - ARROW_SCOPED_TRACE("num_build = ", static_cast(num_build[inum_build])); - ARROW_SCOPED_TRACE("strategy = ", - strategy[istrategy] == BloomFilterBuildStrategy::PARALLEL - ? "PARALLEL" - : "SINGLE_THREADED"); - ARROW_SCOPED_TRACE("avx2 = ", use_avx2 ? 
"AVX2" : "SCALAR"); - TestBloomLarge(strategy[istrategy], num_build[inum_build], use_avx2, + std::vector strategies; + strategies.push_back(BloomFilterBuildStrategy::PARALLEL); + + for (const auto hardware_flags : HardwareFlagsForTesting()) { + for (const auto& strategy : strategies) { + for (const auto n : num_build) { + ARROW_SCOPED_TRACE("num_build = ", n); + ARROW_SCOPED_TRACE("strategy = ", strategy == BloomFilterBuildStrategy::PARALLEL + ? "PARALLEL" + : "SINGLE_THREADED"); + ARROW_SCOPED_TRACE("hardware_flags = ", hardware_flags); + TestBloomLarge(strategy, n, hardware_flags, /*enable_prefetch=*/false); } } diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index d5c0b7817f55f..0888dd8938455 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace acero { -#if defined(ARROW_HAVE_AVX2) - template int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, @@ -191,7 +189,5 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, return num_rows - (num_rows % unroll); } -#endif - } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index cd12b34a0c6dc..88b80f06f57f2 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -80,7 +80,7 @@ class RowArrayAccessor { const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); private: -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) // This is equivalent to Visit method, but processing 8 rows at a time in a // loop. // Returns the number of processed rows, which may be less than requested (up diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc index 2042650be6acb..f50ca92238dc4 100644 --- a/cpp/src/arrow/acero/test_util_internal.cc +++ b/cpp/src/arrow/acero/test_util_internal.cc @@ -45,8 +45,10 @@ #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/async_generator.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/unreachable.h" @@ -54,6 +56,7 @@ namespace arrow { +using arrow::internal::CpuInfo; using arrow::internal::Executor; using compute::SortKey; @@ -62,6 +65,7 @@ using compute::Take; namespace acero { namespace { + void ValidateOutputImpl(const ArrayData& output) { ASSERT_OK(::arrow::internal::ValidateArrayFull(output)); TestInitialized(output); @@ -116,6 +120,11 @@ void ValidateOutput(const Datum& output) { } } +std::vector HardwareFlagsForTesting() { + // Acero currently only has AVX2 optimizations + return arrow::GetSupportedHardwareFlags({CpuInfo::AVX2}); +} + namespace { struct DummyNode : ExecNode { diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h index 03f417028650b..569fb1254db4a 100644 --- a/cpp/src/arrow/acero/test_util_internal.h +++ b/cpp/src/arrow/acero/test_util_internal.h @@ -20,6 +20,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/vector.h" +#include #include #include #include @@ -33,12 +34,14 @@ #include "arrow/util/async_generator.h" #include "arrow/util/pcg_random.h" -namespace arrow { - -namespace acero { +namespace arrow::acero { void 
ValidateOutput(const Datum& output); +// Enumerate all hardware flags that can be tested on this platform +// and would lead to different code paths being tested in Acero. +std::vector HardwareFlagsForTesting(); + using StartProducingFunc = std::function; using StopProducingFunc = std::function; @@ -204,5 +207,4 @@ struct TableGenerationProperties { Result> MakeRandomTimeSeriesTable( const TableGenerationProperties& properties); -} // namespace acero -} // namespace arrow +} // namespace arrow::acero diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index a17d6275a763a..0bd6fe86134ab 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -18,11 +18,20 @@ # ---------------------------------------------------------------------- # Tests that don't require the full kernel library +# Define arrow_compute_testing object library for common test files +if(ARROW_TESTING) + add_library(arrow_compute_kernels_testing OBJECT test_util.cc) + # Even though this is still just an object library we still need to "link" our + # dependencies so that include paths are configured correctly + target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST}) +endif() + add_arrow_test(scalar_cast_test ${ARROW_COMPUTE_TEST_ARGS} SOURCES scalar_cast_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) # ---------------------------------------------------------------------- # Scalar kernels @@ -32,25 +41,36 @@ add_arrow_compute_test(scalar_type_test scalar_boolean_test.cc scalar_nested_test.cc scalar_string_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(scalar_if_else_test SOURCES scalar_if_else_test.cc test_util.cc) +add_arrow_compute_test(scalar_if_else_test + SOURCES + scalar_if_else_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(scalar_temporal_test SOURCES scalar_temporal_test.cc test_util.cc) +add_arrow_compute_test(scalar_temporal_test + SOURCES + scalar_temporal_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_compute_test(scalar_math_test SOURCES scalar_arithmetic_test.cc scalar_compare_test.cc scalar_round_arithmetic_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_compute_test(scalar_utility_test SOURCES scalar_random_test.cc scalar_set_lookup_test.cc scalar_validity_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") @@ -75,12 +95,20 @@ add_arrow_compute_test(vector_test vector_replace_test.cc vector_run_end_encode_test.cc select_k_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(vector_sort_test SOURCES vector_sort_test.cc test_util.cc) +add_arrow_compute_test(vector_sort_test + SOURCES + vector_sort_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(vector_selection_test SOURCES vector_selection_test.cc - test_util.cc) +add_arrow_compute_test(vector_selection_test + SOURCES + vector_selection_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") @@ -94,7 +122,11 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # Aggregates 
-add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc) +add_arrow_compute_test(aggregate_test + SOURCES + aggregate_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) # ---------------------------------------------------------------------- # Utilities diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc index 3fcfbf3d8312d..f5867b405ec71 100644 --- a/cpp/src/arrow/compute/key_hash.cc +++ b/cpp/src/arrow/compute/key_hash.cc @@ -236,7 +236,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t const uint32_t* offsets, const uint8_t* concatenated_keys, uint32_t* hashes, uint32_t* hashes_temp_for_combine) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, concatenated_keys, hashes, hashes_temp_for_combine); @@ -255,7 +255,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t const uint64_t* offsets, const uint8_t* concatenated_keys, uint32_t* hashes, uint32_t* hashes_temp_for_combine) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, concatenated_keys, hashes, hashes_temp_for_combine); @@ -361,7 +361,7 @@ void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t } uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys, hashes, hashes_temp_for_combine); diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h index e43d7b8df523d..b193716c9bdfd 100644 --- a/cpp/src/arrow/compute/key_hash.h +++ b/cpp/src/arrow/compute/key_hash.h @@ -17,7 +17,7 @@ #pragma once -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) #include #endif @@ -115,7 +115,7 @@ class ARROW_EXPORT Hashing32 { static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_key, const uint8_t* keys, uint32_t* hashes); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static inline __m256i Avalanche_avx2(__m256i hash); static inline __m256i CombineHashesImp_avx2(__m256i previous_hash, __m256i hash); template diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc index f30c3460bda60..1b444b576784f 100644 --- a/cpp/src/arrow/compute/key_hash_avx2.cc +++ b/cpp/src/arrow/compute/key_hash_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - inline __m256i Hashing32::Avalanche_avx2(__m256i hash) { hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15)); hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2)); @@ -315,7 +313,5 @@ uint32_t Hashing32::HashVarLen_avx2(bool combine_hashes, uint32_t num_rows, } } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index d10645391b413..3e6d41525cf44 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -21,18 +21,26 @@ #include #include #include + #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash.h" #include 
"arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" namespace arrow { using internal::checked_pointer_cast; +using internal::CpuInfo; namespace compute { +std::vector HardwareFlagsForTesting() { + // Our key-hash and key-map routines currently only have AVX2 optimizations + return GetSupportedHardwareFlags({CpuInfo::AVX2}); +} + class TestVectorHash { private: template ::ArrayType> @@ -131,85 +139,79 @@ class TestVectorHash { const offset_t* key_offsets = reinterpret_cast(keys_array->raw_value_offsets()); - std::vector hashes_scalar32; - std::vector hashes_scalar64; - hashes_scalar32.resize(num_rows); - hashes_scalar64.resize(num_rows); - std::vector hashes_simd32; - std::vector hashes_simd64; - hashes_simd32.resize(num_rows); - hashes_simd64.resize(num_rows); - - int64_t hardware_flags_scalar = 0LL; - int64_t hardware_flags_simd = ::arrow::internal::CpuInfo::AVX2; + // For each tested hardware flags, we will compute the hashes and check + // them for consistency. + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + std::vector> hashes32(hardware_flags_for_testing.size()); + std::vector> hashes64(hardware_flags_for_testing.size()); + for (auto& h : hashes32) { + h.resize(num_rows); + } + for (auto& h : hashes64) { + h.resize(num_rows); + } constexpr int mini_batch_size = 1024; std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (bool use_simd : {false, true}) { + for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + const auto hardware_flags = hardware_flags_for_testing[i]; if (use_32bit_hash) { if (!use_varlen_input) { - Hashing32::HashFixed(use_simd ? hardware_flags_simd : hardware_flags_scalar, + Hashing32::HashFixed(hardware_flags, /*combine_hashes=*/false, num_rows, fixed_length, keys, - use_simd ? hashes_simd32.data() : hashes_scalar32.data(), - temp_buffer.data()); + hashes32[i].data(), temp_buffer.data()); } else { for (int first_row = 0; first_row < num_rows;) { int batch_size_next = std::min(num_rows - first_row, mini_batch_size); - Hashing32::HashVarLen( - use_simd ? hardware_flags_simd : hardware_flags_scalar, - /*combine_hashes=*/false, batch_size_next, key_offsets + first_row, keys, - (use_simd ? hashes_simd32.data() : hashes_scalar32.data()) + first_row, - temp_buffer.data()); + Hashing32::HashVarLen(hardware_flags, + /*combine_hashes=*/false, batch_size_next, + key_offsets + first_row, keys, + hashes32[i].data() + first_row, temp_buffer.data()); first_row += batch_size_next; } } + for (int j = 0; j < num_rows; ++j) { + hashes64[i][j] = hashes32[i][j]; + } } else { if (!use_varlen_input) { Hashing64::HashFixed( - /*combine_hashes=*/false, num_rows, fixed_length, keys, - use_simd ? hashes_simd64.data() : hashes_scalar64.data()); + /*combine_hashes=*/false, num_rows, fixed_length, keys, hashes64[i].data()); } else { Hashing64::HashVarLen( - /*combine_hashes=*/false, num_rows, key_offsets, keys, - use_simd ? 
hashes_simd64.data() : hashes_scalar64.data()); + /*combine_hashes=*/false, num_rows, key_offsets, keys, hashes64[i].data()); } } } - if (use_32bit_hash) { - for (int i = 0; i < num_rows; ++i) { - hashes_scalar64[i] = hashes_scalar32[i]; - hashes_simd64[i] = hashes_simd32[i]; - } - } - - // Verify that both scalar and AVX2 implementations give the same hashes + // Verify that all implementations (scalar, SIMD) give the same hashes // - for (int i = 0; i < num_rows; ++i) { - ASSERT_EQ(hashes_scalar64[i], hashes_simd64[i]) - << "scalar and simd approaches yielded different hashes"; + const auto& hashes_scalar64 = hashes64[0]; + for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (int j = 0; j < num_rows; ++j) { + ASSERT_EQ(hashes64[i][j], hashes_scalar64[j]) + << "scalar and simd approaches yielded different hashes"; + } } // Verify that the same key appearing multiple times generates the same hash // each time. Measure the number of unique hashes and compare to the number // of unique keys. // - std::map unique_key_to_hash; - std::set unique_hashes; + std::unordered_map unique_key_to_hash; + std::unordered_set unique_hashes; for (int i = 0; i < num_rows; ++i) { - std::map::iterator iter = unique_key_to_hash.find(row_ids[i]); - if (iter == unique_key_to_hash.end()) { - unique_key_to_hash.insert(std::make_pair(row_ids[i], hashes_scalar64[i])); - } else { - ASSERT_EQ(iter->second, hashes_scalar64[i]); - } - if (unique_hashes.find(hashes_scalar64[i]) == unique_hashes.end()) { - unique_hashes.insert(hashes_scalar64[i]); + auto [it, inserted] = + unique_key_to_hash.try_emplace(row_ids[i], hashes_scalar64[i]); + if (!inserted) { + ASSERT_EQ(it->second, hashes_scalar64[i]); } + unique_hashes.insert(hashes_scalar64[i]); } float percent_hash_collisions = 100.0f * static_cast(num_unique - unique_hashes.size()) / diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index fd5c404a07f8d..71ca56c91a9ff 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -133,7 +133,7 @@ void SwissTable::extract_group_ids(const int num_keys, const uint16_t* optional_ // Optimistically use simplified lookup involving only a start block to find // a single group id candidate for every input. -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) int num_group_id_bytes = num_group_id_bits / 8; if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && !optional_selection) { num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots, out_group_ids, @@ -301,7 +301,7 @@ void SwissTable::early_filter(const int num_keys, const uint32_t* hashes, // Optimistically use simplified lookup involving only a start block to find // a single group id candidate for every input. 
int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) { if (log_blocks_ <= 4) { num_processed = early_filter_imp_avx2_x32(num_keys, hashes, out_match_bitvector, diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index 7ab48470f21e4..95fb3be274288 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -163,7 +163,7 @@ class ARROW_EXPORT SwissTable { // void early_filter_imp(const int num_keys, const uint32_t* hashes, uint8_t* out_match_bitvector, uint8_t* out_local_slots) const; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes, uint8_t* out_match_bitvector, uint8_t* out_local_slots) const; diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc index eb318ff188fbb..731553511044f 100644 --- a/cpp/src/arrow/compute/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - // This is more or less translation of equivalent scalar code, adjusted for a // different instruction set (e.g. missing leading zero count instruction). // @@ -412,7 +410,5 @@ int SwissTable::extract_group_ids_avx2(const int num_keys, const uint32_t* hashe return num_keys - (num_keys % unroll); } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 39ac33932b548..7c402e7a2384d 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -42,7 +42,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com return; } uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, @@ -130,7 +130,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, const RowTableImpl& rows, uint8_t* match_bytevector) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = CompareBinaryColumnToRow_avx2( use_selection, offset_within_row, num_rows_to_compare, sel_left_maybe_null, @@ -297,7 +297,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t id_varbinary_col, const RowTableImpl& rows, uint8_t* match_bytevector) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = CompareVarBinaryColumnToRow_avx2( use_selection, is_first_varbinary_col, id_varbinary_col, num_rows_to_compare, @@ -313,7 +313,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t id_varbinary_col, void KeyCompare::AndByteVectors(LightContext* ctx, uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = AndByteVectors_avx2(num_elements, bytevector_A, bytevector_B); } diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 638b8c2ec721f..db953fbe11271 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ 
b/cpp/src/arrow/compute/row/compare_internal.h @@ -86,7 +86,7 @@ class ARROW_EXPORT KeyCompare { static void AndByteVectors(LightContext* ctx, uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) template static uint32_t NullUpdateColumnToRowImp_avx2( diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 95f37ab617db5..ff407c51b83cb 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -24,8 +24,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - inline __m256i set_first_n_bytes_avx2(int n) { constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; @@ -670,7 +668,5 @@ uint32_t KeyCompare::CompareVarBinaryColumnToRow_avx2( return num_rows_to_compare; } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 3a6a85b0272f8..01d552ef8270f 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -455,7 +455,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, bool is_row_fixed_length = rows.metadata().is_fixed_length; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows, col); @@ -466,7 +466,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, } else { DecodeImp(start_row, num_rows, offset_within_row, rows, col); } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif @@ -524,7 +524,7 @@ void EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows, bool is_row_fixed_length = rows.metadata().is_fixed_length; uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2() && col_width1 == col_width2) { num_processed = DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows, @@ -772,7 +772,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, KeyColumnArray* col, LightContext* ctx) { // Output column varbinary buffer needs an extra 32B // at the end in avx2 version and 8B otherwise. 
-#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col); } else { @@ -782,7 +782,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, } else { DecodeImp(start_row, num_rows, varbinary_col_id, rows, col); } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif } diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index b83767b694cfd..6091fb66982af 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -187,7 +187,7 @@ class EncoderBinary { template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col); @@ -213,7 +213,7 @@ class EncoderBinaryPair { static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col1, KeyColumnArray* col2); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, @@ -300,7 +300,7 @@ class EncoderVarBinary { template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const RowTableImpl& rows, KeyColumnArray* col); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const RowTableImpl& rows, KeyColumnArray* col); diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index 02ba310bded20..50969c7bd6034 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -22,8 +22,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - void EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col) { @@ -230,7 +228,5 @@ void EncoderVarBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, }); } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index f69f60a5af434..faf3e0c87e4d2 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -118,7 +118,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, // 64 bits at a time constexpr int unroll = 64; int tail = num_bits % unroll; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { if (filter_input_indexes) { avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, @@ -141,7 +141,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); } } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif // Optionally process the last partial word with masking 
out bits outside range @@ -253,7 +253,7 @@ void bits_to_bytes(int64_t hardware_flags, const int num_bits, const uint8_t* bi } int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { // The function call below processes whole 32 bit chunks together. num_processed = num_bits - (num_bits % 32); @@ -309,7 +309,7 @@ void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* by } int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { // The function call below processes whole 32 bit chunks together. num_processed = num_bits - (num_bits % 32); @@ -339,7 +339,7 @@ void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* by bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, uint32_t num_bytes) { -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { return avx2::are_all_bytes_zero_avx2(bytes, num_bytes); } diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 489139eab87f2..730e59f346a52 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -168,7 +168,7 @@ ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits, ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, uint32_t num_bytes); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) namespace avx2 { ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, diff --git a/cpp/src/arrow/compute/util_avx2.cc b/cpp/src/arrow/compute/util_avx2.cc index 89ec6aa97a608..0191ab06f9532 100644 --- a/cpp/src/arrow/compute/util_avx2.cc +++ b/cpp/src/arrow/compute/util_avx2.cc @@ -21,9 +21,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/logging.h" -#if defined(ARROW_HAVE_AVX2) - -namespace arrow::util::avx2 { +namespace arrow::util::bit_util::avx2 { template void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int* num_indexes, @@ -211,6 +209,4 @@ bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) { return result_or32 == 0; } -} // namespace arrow::util::avx2 - -#endif // ARROW_HAVE_AVX2 +} // namespace arrow::util::bit_util::avx2 diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index b59854480765b..e8a782575e278 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -43,6 +43,7 @@ #include "arrow/table.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/pcg_random.h" @@ -211,4 +212,18 @@ const std::vector>& all_dictionary_index_types() { return types; } +std::vector GetSupportedHardwareFlags( + const std::vector& candidate_flags) { + std::vector hardware_flags; + // Always test fallback codepaths + hardware_flags.push_back(0); + for (const int64_t candidate_flag : candidate_flags) { + if (candidate_flag != 0 && + internal::CpuInfo::GetInstance()->IsSupported(candidate_flag)) { + hardware_flags.push_back(candidate_flag); + } + } + return hardware_flags; +} + } // namespace arrow diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 4f4b03438fd58..b4b2785a36292 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -131,4 +131,10 @@ 
ARROW_TESTING_EXPORT std::string GetListenAddress(); ARROW_TESTING_EXPORT const std::vector>& all_dictionary_index_types(); +// Get a list of supported hardware flags from the given candidates. +// The result will always contain 0, meaning no optional CPU feature enabled at all. +ARROW_TESTING_EXPORT +std::vector GetSupportedHardwareFlags( + const std::vector& candidate_flags); + } // namespace arrow diff --git a/cpp/src/arrow/util/byte_stream_split.h b/cpp/src/arrow/util/byte_stream_split.h index 28dcce52bb8fc..d428df0659b28 100644 --- a/cpp/src/arrow/util/byte_stream_split.h +++ b/cpp/src/arrow/util/byte_stream_split.h @@ -39,9 +39,9 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams; const int64_t size = num_values * sizeof(T); - constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams; const int64_t num_blocks = size / kBlockSize; uint8_t* output_data = reinterpret_cast(out); @@ -92,11 +92,12 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_value uint8_t* output_buffer_raw) { constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); + constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams; + __m128i stage[3][kNumStreams]; __m128i final_result[kNumStreams]; const size_t size = num_values * sizeof(T); - constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams; const size_t num_blocks = size / kBlockSize; const __m128i* raw_values_sse = reinterpret_cast(raw_values); __m128i* output_buffer_streams[kNumStreams]; @@ -143,7 +144,7 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_value _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // This is the path for double. __m128i tmp[8]; for (size_t i = 0; i < 4; ++i) { @@ -181,9 +182,9 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams; const int64_t size = num_values * sizeof(T); - constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams; if (size < kBlockSize) // Back to SSE for small size return ByteStreamSplitDecodeSse2(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; @@ -220,7 +221,7 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // path for double, 128i index: // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B}, // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F}, @@ -266,11 +267,12 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value uint8_t* output_buffer_raw) { constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - if (kNumStreams == 8U) // Back to SSE, currently no path for double. 
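The `GetSupportedHardwareFlags` helper added above always includes `0`, so callers exercise the scalar fallback alongside any SIMD paths the host supports. A minimal sketch (not part of the patch series) of how a test could consume it; the call inside the loop is hypothetical and only illustrates the dispatch pattern:

```
// Sketch: run a kernel once per supported hardware_flags value, covering
// both the fallback (flags == 0) and AVX2 on capable hosts.
#include <cstdint>
#include <vector>

#include "arrow/testing/util.h"
#include "arrow/util/cpu_info.h"

namespace arrow {

void ExerciseAllSimdPaths() {
  const std::vector<int64_t> candidates = {internal::CpuInfo::AVX2};
  for (const int64_t hardware_flags : GetSupportedHardwareFlags(candidates)) {
    // Invoke the kernel under test with `hardware_flags` here, e.g. one of
    // the bit_util routines touched by this patch (hypothetical usage):
    // bit_util::bits_to_bytes(hardware_flags, num_bits, bits, bytes);
  }
}

}  // namespace arrow
```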
+ constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams; + + if constexpr (kNumStreams == 8U) // Back to SSE, currently no path for double. return ByteStreamSplitEncodeSse2(raw_values, num_values, output_buffer_raw); const size_t size = num_values * sizeof(T); - constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams; if (size < kBlockSize) // Back to SSE for small size return ByteStreamSplitEncodeSse2(raw_values, num_values, output_buffer_raw); const size_t num_blocks = size / kBlockSize; @@ -339,9 +341,9 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_ constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; const int64_t size = num_values * sizeof(T); - constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; if (size < kBlockSize) // Back to AVX2 for small size return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; @@ -379,7 +381,7 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_ } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // path for double, 128i index: // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, @@ -442,8 +444,10 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val uint8_t* output_buffer_raw) { constexpr size_t kNumStreams = sizeof(T); static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - const size_t size = num_values * sizeof(T); constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams; + + const size_t size = num_values * sizeof(T); + if (size < kBlockSize) // Back to AVX2 for small size return ByteStreamSplitEncodeAvx2(raw_values, num_values, output_buffer_raw); @@ -469,7 +473,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val __m512i unpack[KNumUnpack + 1][kNumStreams]; __m512i permutex[kNumStreams]; __m512i permutex_mask; - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version. permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006, 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004, @@ -494,7 +498,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val } } - if (kNumStreams == 8U) { + if constexpr (kNumStreams == 8U) { // path for double // 1. unpack to epi16 block // 2. permutexvar_epi16 to 128i block diff --git a/docker-compose.yml b/docker-compose.yml index fbb879b2bc54b..8727aded2c825 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,10 @@ x-sccache: &sccache SCCACHE_REGION: SCCACHE_S3_KEY_PREFIX: ${SCCACHE_S3_KEY_PREFIX:-sccache} +x-cpp: &cpp + ARROW_RUNTIME_SIMD_LEVEL: + ARROW_SIMD_LEVEL: + # CPU/memory limit presets to pass to Docker. 
# # Usage: archery docker run --resource-limit=github @@ -227,7 +231,7 @@ services: ulimits: &ulimits core: ${ULIMIT_CORE} environment: - <<: [*common, *ccache] + <<: [*common, *ccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" volumes: &alpine-linux-volumes @@ -278,7 +282,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_BUILD_BENCHMARKS: "ON" ARROW_BUILD_EXAMPLES: "ON" ARROW_ENABLE_TIMING_TESTS: # inherit @@ -313,7 +317,7 @@ services: arch: ${ARCH} shm_size: *shm-size environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] # Shrink test runtime by enabling minimal optimizations ARROW_C_FLAGS_DEBUG: "-g1 -Og" ARROW_CXX_FLAGS_DEBUG: "-g1 -Og" @@ -349,7 +353,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" volumes: &debian-volumes @@ -390,7 +394,7 @@ services: - apparmor:unconfined ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" volumes: &ubuntu-volumes @@ -426,7 +430,7 @@ services: - apparmor:unconfined ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_HOME: /arrow ARROW_DEPENDENCY_SOURCE: BUNDLED LIBARROW_MINIMAL: "false" @@ -448,7 +452,7 @@ services: volumes: - .:/arrow:delegated environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_DEPENDENCY_SOURCE: BUNDLED ARROW_HOME: /arrow LIBARROW_MINIMAL: "false" @@ -470,7 +474,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_DEPENDENCY_SOURCE: BUNDLED CMAKE_GENERATOR: "Unix Makefiles" volumes: *ubuntu-volumes @@ -491,7 +495,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_BUILD_UTILITIES: "OFF" ARROW_COMPUTE: "OFF" ARROW_CSV: "OFF" @@ -538,7 +542,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_BUILD_UTILITIES: "OFF" ARROW_COMPUTE: "OFF" ARROW_CSV: "OFF" @@ -588,7 +592,7 @@ services: shm_size: *shm-size volumes: *ubuntu-volumes environment: - <<: [*common, *ccache] + <<: [*common, *ccache, *cpp] CC: clang-${CLANG_TOOLS} CXX: clang++-${CLANG_TOOLS} # Avoid creating huge static libraries @@ -630,7 +634,7 @@ services: shm_size: *shm-size volumes: *ubuntu-volumes environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] CC: clang-${CLANG_TOOLS} CXX: clang++-${CLANG_TOOLS} ARROW_BUILD_STATIC: "OFF" @@ -662,7 +666,7 @@ services: shm_size: *shm-size ulimits: *ulimits environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 From be2014a9ebfb9570b016a5f0beda11022ace45d1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Jul 2023 12:32:27 +0200 Subject: [PATCH 013/749] GH-36767: [C++][CI] Fix test failure on i386 (#36769) ### Rationale for this change Our nightly build on i386 Debian (a 32-bit x86 build) fails with an error in decimal-to-real tests:
https://github.com/ursacomputing/crossbow/actions/runs/5593823296/jobs/10227952675#step:6:3255 ### What changes are included in this PR? Improve error messages by displaying the actual and expected values. A side effect of this error message improvement is to... fix the test failure, as storing the computation result in a local variable seems to change the computed absolute difference. This is probably due to x86 FPU rounding shenanigans, as explained here: https://stackoverflow.com/questions/37626687/c-fundamentals-double-variable-not-equal-to-double-expression ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #36767 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/util/decimal_test.cc | 33 +++++++++++++++++++----------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc index 6376a9545a0f8..0a8b7a09730bf 100644 --- a/cpp/src/arrow/util/decimal_test.cc +++ b/cpp/src/arrow/util/decimal_test.cc @@ -1046,42 +1046,51 @@ using ToDoubleTestParam = ToRealTestParam; template void CheckDecimalToReal(const std::string& decimal_value, int32_t scale, Real expected) { Decimal dec(decimal_value); - ASSERT_EQ(dec.template ToReal(scale), expected) - << "Decimal value: " << decimal_value << " Scale: " << scale; + Real actual = dec.template ToReal(scale); + ASSERT_EQ(actual, expected) << "Decimal value: " << decimal_value + << ", scale: " << scale << ", expected: " << expected + << ", actual: " << actual; } template void CheckDecimalToRealWithinOneULP(const std::string& decimal_value, int32_t scale, Real expected) { Decimal dec(decimal_value); - auto result = dec.template ToReal(scale); - ASSERT_TRUE(result == expected || result == std::nextafter(expected, expected + 1) || - result == std::nextafter(expected, expected - 1)) - << "Decimal value: " << decimal_value << " Scale: " << scale; + Real actual = dec.template ToReal(scale); + ASSERT_TRUE(actual == expected || actual == std::nextafter(expected, expected + 1) || + actual == std::nextafter(expected, expected - 1)) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } template void CheckDecimalToRealWithinEpsilon(const std::string& decimal_value, int32_t scale, Real epsilon, Real expected) { Decimal dec(decimal_value); - ASSERT_TRUE(std::abs(dec.template ToReal(scale) - expected) <= epsilon) - << "Decimal value: " << decimal_value << " Scale: " << scale; + Real actual = dec.template ToReal(scale); + ASSERT_LE(std::abs(actual - expected), epsilon) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } template void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, float expected) { Decimal dec(decimal_value); - ASSERT_FLOAT_EQ(dec.template ToReal(scale), expected) - << "Decimal value: " << decimal_value << " Scale: " << scale; + float actual = dec.template ToReal(scale); + ASSERT_FLOAT_EQ(actual, expected) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } template void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, double expected) { Decimal dec(decimal_value); - ASSERT_DOUBLE_EQ(dec.template ToReal(scale), expected) - << "Decimal value: " << decimal_value << " Scale: " << scale; + double actual = dec.template ToReal(scale); + 
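The x87 effect the rationale refers to can be reproduced in isolation. A minimal sketch (not part of the patch), assuming a 32-bit x86 build that keeps intermediates in 80-bit FPU registers (e.g. `g++ -m32 -mfpmath=387`); the exact outcome depends on compiler and optimization flags:

```
#include <cstdio>

// Division by 10 is inexact in binary floating point, so the x87 80-bit
// intermediate differs from the correctly rounded 64-bit double result.
double tenth(double x) { return x / 10.0; }

int main() {
  double stored = tenth(1.0);  // the store rounds the result to 64 bits
  // May print 0 on x87 builds: the fresh call can be compared while still
  // held in an extended-precision register, unlike the stored value.
  std::printf("%d\n", stored == tenth(1.0));
  return 0;
}
```

This is why assigning the result of `ToReal(scale)` to a local before comparing, as the test helpers in this patch now do, can change the computed difference on i386.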
ASSERT_DOUBLE_EQ(actual, expected) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; } // Common tests for Decimal128::ToReal and Decimal256::ToReal From bebd2bf693901b56d54cd3c0dfd0d48a7ed3c0bf Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Wed, 19 Jul 2023 11:45:18 -0700 Subject: [PATCH 014/749] GH-34620: [C#] Support DateOnly and TimeOnly on .NET 6.0+ (#36125) ### What changes are included in this PR? Date32Array and Date64Array now support DateOnly values for construction and reading on .NET 6.0 and later. Time32Array and Time64Array now support TimeOnly values for construction and reading on .NET 6.0 and later. A new TimeArrayBuilder type is used to share logic between Time32Array.Builder and Time64Array.Builder just as the DateArrayBuilder does for the date array types. ### Are these changes tested? Yes * Closes: #34620 Authored-by: Curt Hagenlocher Signed-off-by: Weston Pace --- csharp/src/Apache.Arrow/Arrays/Date32Array.cs | 26 +++ csharp/src/Apache.Arrow/Arrays/Date64Array.cs | 23 +++ .../Apache.Arrow/Arrays/DateArrayBuilder.cs | 70 ++++++++ csharp/src/Apache.Arrow/Arrays/Time32Array.cs | 61 +++++-- csharp/src/Apache.Arrow/Arrays/Time64Array.cs | 65 +++++++- .../Apache.Arrow/Arrays/TimeArrayBuilder.cs | 152 ++++++++++++++++++ .../Apache.Arrow.Tests.csproj | 6 +- .../Apache.Arrow.Tests/ArrayBuilderTests.cs | 8 +- .../Apache.Arrow.Tests/ArrowArrayTests.cs | 8 +- .../Apache.Arrow.Tests/Date32ArrayTests.cs | 32 ++++ .../Apache.Arrow.Tests/Date64ArrayTests.cs | 33 ++++ .../Apache.Arrow.Tests/TestDateAndTimeData.cs | 5 + .../test/Apache.Arrow.Tests/TimeOnlyTests.cs | 111 +++++++++++++ 13 files changed, 574 insertions(+), 26 deletions(-) create mode 100644 csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs create mode 100644 csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs diff --git a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs index 35c0065e11907..23ad7356eb322 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs @@ -25,6 +25,9 @@ namespace Apache.Arrow public class Date32Array : PrimitiveArray { private static readonly DateTime _epochDate = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Unspecified); +#if NET6_0_OR_GREATER + private static readonly int _epochDayNumber = new DateOnly(1970, 1, 1).DayNumber; +#endif /// /// The class can be used to fluently build objects. @@ -57,6 +60,13 @@ protected override int Convert(DateTimeOffset dateTimeOffset) // DateTimeOffset.Date property. return (int)(dateTimeOffset.UtcDateTime.Date - _epochDate).TotalDays; } + +#if NET6_0_OR_GREATER + protected override int Convert(DateOnly date) + { + return (int)(date.DayNumber - _epochDayNumber); + } +#endif } public Date32Array( @@ -108,5 +118,21 @@ public Date32Array(ArrayData data) ? new DateTimeOffset(_epochDate.AddDays(value.Value), TimeSpan.Zero) : default(DateTimeOffset?); } + +#if NET6_0_OR_GREATER + /// + /// Get the date at the specified index + /// + /// Index at which to get the date. + /// Returns a , or null if there is no object at that index. + /// + public DateOnly? GetDateOnly(int index) + { + int? value = GetValue(index); + return value.HasValue + ? 
DateOnly.FromDayNumber(_epochDayNumber + value.Value) + : default(DateOnly?); + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs index cf977b2e4969a..b0d42e27bbd23 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs @@ -69,6 +69,13 @@ protected override long Convert(DateTimeOffset dateTimeOffset) long days = millis / MillisecondsPerDay; return (millis < 0 ? days - 1 : days) * MillisecondsPerDay; } + +#if NET6_0_OR_GREATER + protected override long Convert(DateOnly date) + { + return ((long)date.DayNumber - _epochDayNumber) * MillisecondsPerDay; + } +#endif } public Date64Array(ArrayData data) @@ -113,5 +120,21 @@ public Date64Array(ArrayData data) ? DateTimeOffset.FromUnixTimeMilliseconds(value.Value) : default(DateTimeOffset?); } + +#if NET6_0_OR_GREATER + /// + /// Get the date at the specified index + /// + /// Index at which to get the date. + /// Returns a , or null if there is no object at that index. + /// + public DateOnly? GetDateOnly(int index) + { + long? value = GetValue(index); + return value.HasValue + ? DateOnly.FromDateTime(DateTimeOffset.FromUnixTimeMilliseconds(value.Value).UtcDateTime) + : default(DateOnly?); + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs index 4e69f6fe3e7e1..dcbb76930b6d3 100644 --- a/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs +++ b/csharp/src/Apache.Arrow/Arrays/DateArrayBuilder.cs @@ -28,9 +28,16 @@ public abstract class DateArrayBuilder : DelegatingArrayBuilder, IArrowArrayBuilder, IArrowArrayBuilder +#if NET6_0_OR_GREATER + , IArrowArrayBuilder +#endif where TArray : IArrowArray where TBuilder : class, IArrowArrayBuilder { +#if NET6_0_OR_GREATER + protected static readonly long _epochDayNumber = new DateOnly(1970, 1, 1).DayNumber; +#endif + /// /// Construct a new instance of the class. /// @@ -72,6 +79,20 @@ public TBuilder Append(DateTimeOffset value) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Append a date from a object to the array. + /// + /// + /// Date to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(DateOnly value) + { + InnerBuilder.Append(Convert(value)); + return this as TBuilder; + } +#endif + /// /// Append a span of dates in the form of objects to the array. /// @@ -114,6 +135,24 @@ public TBuilder Append(ReadOnlySpan span) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Append a span of dates in the form of objects to the array. + /// + /// Span of dates to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(ReadOnlySpan span) + { + InnerBuilder.Reserve(span.Length); + foreach (var item in span) + { + InnerBuilder.Append(Convert(item)); + } + + return this as TBuilder; + } +#endif + /// /// Append a null date to the array. /// @@ -156,6 +195,19 @@ public TBuilder AppendRange(IEnumerable values) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Append a collection of dates in the form of objects to the array. + /// + /// Collection of dates to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + InnerBuilder.AppendRange(values.Select(Convert)); + return this as TBuilder; + } +#endif + /// /// Set the value of a date in the form of a object at the specified index. 
/// @@ -190,6 +242,20 @@ public TBuilder Set(int index, DateTimeOffset value) return this as TBuilder; } +#if NET6_0_OR_GREATER + /// + /// Set the value of a date in the form of a object at the specified index. + /// + /// Index at which to set value. + /// Date to set. + /// Returns the builder (for fluent-style composition). + public TBuilder Set(int index, DateOnly value) + { + InnerBuilder.Set(index, Convert(value)); + return this as TBuilder; + } +#endif + /// /// Swap the values of the dates at the specified indices. /// @@ -205,5 +271,9 @@ public TBuilder Swap(int i, int j) protected abstract TUnderlying Convert(DateTime dateTime); protected abstract TUnderlying Convert(DateTimeOffset dateTimeOffset); + +#if NET6_0_OR_GREATER + protected abstract TUnderlying Convert(DateOnly date); +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs index bdaf64d5561c7..31d17d06a1e40 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs @@ -14,6 +14,7 @@ // limitations under the License. using Apache.Arrow.Types; +using System; using System.IO; namespace Apache.Arrow @@ -27,14 +28,19 @@ public class Time32Array : PrimitiveArray /// /// The class can be used to fluently build objects. /// - public class Builder : PrimitiveArrayBuilder + public class Builder : TimeArrayBuilder { - protected override Time32Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Time32Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); - - protected Time32Type DataType { get; } + private class TimeBuilder : PrimitiveArrayBuilder + { + public Time32Type DataType { get; } + + public TimeBuilder(Time32Type dataType) => DataType = dataType; + + protected override Time32Array Build( + ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, + int length, int nullCount, int offset) => + new Time32Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); + } public Builder() : this(Time32Type.Default) { } @@ -46,10 +52,22 @@ public Builder(TimeUnit unit) /// Construct a new instance of the class. /// public Builder(Time32Type type) - : base() + : base(new TimeBuilder(type)) + { + } + +#if NET6_0_OR_GREATER + protected override int Convert(TimeOnly time) { - DataType = type; + var unit = ((TimeBuilder)InnerBuilder).DataType.Unit; + return unit switch + { + TimeUnit.Second => (int)(time.Ticks / TimeSpan.TicksPerSecond), + TimeUnit.Millisecond => (int)(time.Ticks / TimeSpan.TicksPerMillisecond), + _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") + }; } +#endif } public Time32Array( @@ -113,5 +131,30 @@ public Time32Array(ArrayData data) _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") }; } + +#if NET6_0_OR_GREATER + /// + /// Get the time at the specified index as + /// + /// Index at which to get the time. + /// Returns a , or null if there is no object at that index. + /// + public TimeOnly? GetTime(int index) + { + int? 
value = GetValue(index); + if (value == null) + { + return null; + } + + var unit = ((Time32Type)Data.DataType).Unit; + return unit switch + { + TimeUnit.Second => new TimeOnly(value.Value * TimeSpan.TicksPerSecond), + TimeUnit.Millisecond => new TimeOnly(value.Value * TimeSpan.TicksPerMillisecond), + _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") + }; + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs index 127db63a7e09d..95faf18fe9e61 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs @@ -14,6 +14,7 @@ // limitations under the License. using Apache.Arrow.Types; +using System; using System.IO; namespace Apache.Arrow @@ -24,17 +25,25 @@ namespace Apache.Arrow /// public class Time64Array : PrimitiveArray { + private const long TicksPerMicrosecond = 10; + private const long NanosecondsPerTick = 100; + /// /// The class can be used to fluently build objects. /// - public class Builder : PrimitiveArrayBuilder + public class Builder : TimeArrayBuilder { - protected override Time64Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Time64Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); + private class TimeBuilder : PrimitiveArrayBuilder + { + public Time64Type DataType { get; } - protected Time64Type DataType { get; } + public TimeBuilder(Time64Type dataType) => DataType = dataType; + + protected override Time64Array Build( + ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, + int length, int nullCount, int offset) => + new Time64Array(DataType, valueBuffer, nullBitmapBuffer, length, nullCount, offset); + } public Builder() : this(Time64Type.Default) { } @@ -46,10 +55,22 @@ public Builder(TimeUnit unit) /// Construct a new instance of the class. /// public Builder(Time64Type type) - : base() + : base(new TimeBuilder(type)) + { + } + +#if NET6_0_OR_GREATER + protected override long Convert(TimeOnly time) { - DataType = type; + var unit = ((TimeBuilder)InnerBuilder).DataType.Unit; + return unit switch + { + TimeUnit.Microsecond => (long)(time.Ticks / TicksPerMicrosecond), + TimeUnit.Nanosecond => (long)(time.Ticks * NanosecondsPerTick), + _ => throw new InvalidDataException($"Unsupported time unit for Time64Type: {unit}") + }; } +#endif } public Time64Array( @@ -113,5 +134,33 @@ public Time64Array(ArrayData data) _ => throw new InvalidDataException($"Unsupported time unit for Time64Type: {unit}") }; } + +#if NET6_0_OR_GREATER + /// + /// Get the time at the specified index as + /// + /// + /// This may cause truncation of nanosecond values, as the resolution of TimeOnly is in 100-ns increments. + /// + /// Index at which to get the time. + /// Returns a , or null if there is no object at that index. + /// + public TimeOnly? GetTime(int index) + { + long? 
value = GetValue(index); + if (value == null) + { + return null; + } + + var unit = ((Time64Type)Data.DataType).Unit; + return unit switch + { + TimeUnit.Microsecond => new TimeOnly(value.Value * TicksPerMicrosecond), + TimeUnit.Nanosecond => new TimeOnly(value.Value / NanosecondsPerTick), + _ => throw new InvalidDataException($"Unsupported time unit for Time64Type: {unit}") + }; + } +#endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs new file mode 100644 index 0000000000000..da93db84717da --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/TimeArrayBuilder.cs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Apache.Arrow +{ + public abstract class TimeArrayBuilder : + DelegatingArrayBuilder +#if NET6_0_OR_GREATER + , IArrowArrayBuilder +#endif + where TArray : IArrowArray + where TBuilder : class, IArrowArrayBuilder + { + /// + /// Construct a new instance of the class. + /// + /// Inner builder that will produce arrays of type . + /// + protected TimeArrayBuilder(IArrowArrayBuilder> innerBuilder) + : base(innerBuilder) + { } + +#if NET6_0_OR_GREATER + /// + /// Append a time in the form of a object to the array. + /// + /// Time to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(TimeOnly value) + { + InnerBuilder.Append(Convert(value)); + return this as TBuilder; + } +#endif + + /// + /// Append a time + /// + /// + /// + public TBuilder Append(TUnderlying value) + { + InnerBuilder.Append(value); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + /// + /// Append a span of times in the form of objects to the array. + /// + /// Span of times to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(ReadOnlySpan span) + { + InnerBuilder.Reserve(span.Length); + foreach (var item in span) + { + InnerBuilder.Append(Convert(item)); + } + + return this as TBuilder; + } +#endif + + public TBuilder Append(ReadOnlySpan values) + { + InnerBuilder.Append(values); + return this as TBuilder; + } + + /// + /// Append a null time to the array. + /// + /// Returns the builder (for fluent-style composition). + public TBuilder AppendNull() + { + InnerBuilder.AppendNull(); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + /// + /// Append a collection of times in the form of objects to the array. + /// + /// Collection of times to add. + /// Returns the builder (for fluent-style composition). 
+ public TBuilder AppendRange(IEnumerable values) + { + InnerBuilder.AppendRange(values.Select(Convert)); + return this as TBuilder; + } +#endif + + public TBuilder AppendRange(IEnumerable values) + { + InnerBuilder.AppendRange(values); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + /// + /// Set the value of a time in the form of a object at the specified index. + /// + /// Index at which to set value. + /// Time to set. + /// Returns the builder (for fluent-style composition). + public TBuilder Set(int index, TimeOnly value) + { + InnerBuilder.Set(index, Convert(value)); + return this as TBuilder; + } +#endif + + public TBuilder Set(int index, TUnderlying value) + { + InnerBuilder.Set(index, value); + return this as TBuilder; + } + + /// + /// Swap the values of the times at the specified indices. + /// + /// First index. + /// Second index. + /// Returns the builder (for fluent-style composition). + public TBuilder Swap(int i, int j) + { + InnerBuilder.Swap(i, j); + return this as TBuilder; + } + +#if NET6_0_OR_GREATER + protected abstract TUnderlying Convert(TimeOnly time); +#endif + } +} diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index cdbfe479470a4..55005a91c74a1 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -1,4 +1,4 @@ - + @@ -21,4 +21,8 @@ + + + + \ No newline at end of file diff --git a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs index 9ac2f779a6f69..0c40fd82af7ce 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs @@ -40,8 +40,8 @@ public void PrimitiveArrayBuildersProduceExpectedArray() Test(); Test(); Test(); - Test(); - Test(); + TestArrayBuilder(x => x.Append(10).Append(20).Append(30)); + TestArrayBuilder(x => x.Append(10).Append(20).Append(30)); static void Test() where T : struct, INumber @@ -64,8 +64,8 @@ public void PrimitiveArrayBuildersProduceExpectedArrayWithNulls() Test(); Test(); Test(); - Test(); - Test(); + TestArrayBuilder(x => x.Append(123).AppendNull().AppendNull().Append(127), 4, 2, 0x9); + TestArrayBuilder(x => x.Append(123).AppendNull().AppendNull().Append(127), 4, 2, 0x9); static void Test() where T : struct, INumber diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index af3e0f80e6473..16fca684ff5ec 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -109,8 +109,8 @@ public void SliceArray() TestNumberSlice(); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).Append(new DateTime(2019, 1, 3))); - TestNumberSlice(); - TestNumberSlice(); + TestSlice(x => x.Append(10).Append(20).Append(30)); + TestSlice(x => x.Append(10).Append(20).Append(30)); TestSlice(x => x.Append("10").Append("20").Append("30")); static void TestNumberSlice() @@ -136,8 +136,8 @@ public void SlicePrimitiveArrayWithNulls() TestNumberSlice(); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 
3))); - TestNumberSlice(); - TestNumberSlice(); + TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); + TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); static void TestNumberSlice() where T : struct, INumber diff --git a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs index 0d6aad96e5dfd..2a674b942c17b 100644 --- a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs @@ -31,6 +31,11 @@ public static IEnumerable GetDateTimesData() => public static IEnumerable GetDateTimeOffsetsData() => TestDateAndTimeData.ExampleDateTimeOffsets.Select(dto => new object[] { dto }); +#if NET6_0_OR_GREATER + public static IEnumerable GetDateOnlyData() => + TestDateAndTimeData.ExampleDates.Select(d => new object[] { DateOnly.FromDateTime(d) }); +#endif + public class AppendNull { [Fact] @@ -121,5 +126,32 @@ public void AppendGivesUtcDate(DateTimeOffset dateTimeOffset) Assert.Equal(expectedValue, array.GetValue(0)); } } + +#if NET6_0_OR_GREATER + public class AppendDateOnly + { + [Theory] + [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date32ArrayTests))] + public void AppendDateGivesSameDate(DateOnly date) + { + // Arrange + var builder = new Date32Array.Builder(); + var expectedDateTime = date.ToDateTime(TimeOnly.MinValue); + var expectedDateTimeOffset = new DateTimeOffset(expectedDateTime, TimeSpan.Zero); + int expectedValue = date.DayNumber - new DateOnly(1970, 1, 1).DayNumber; + + // Act + builder = builder.Append(date); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(date, array.GetDateOnly(0)); + Assert.Equal(expectedDateTime, array.GetDateTime(0)); + Assert.Equal(expectedDateTimeOffset, array.GetDateTimeOffset(0)); + Assert.Equal(expectedValue, array.GetValue(0)); + } + } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs index 65cffc84e5555..22ae08a617c48 100644 --- a/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Date64ArrayTests.cs @@ -33,6 +33,11 @@ public static IEnumerable GetDateTimesData() => public static IEnumerable GetDateTimeOffsetsData() => TestDateAndTimeData.ExampleDateTimeOffsets.Select(dto => new object[] { dto }); +#if NET6_0_OR_GREATER + public static IEnumerable GetDateOnlyData() => + TestDateAndTimeData.ExampleDates.Select(d => new object[] { DateOnly.FromDateTime(d) }); +#endif + public class AppendNull { [Fact] @@ -129,5 +134,33 @@ public void AppendGivesUtcDate(DateTimeOffset dateTimeOffset) Assert.Equal(0, array.GetValue(0).Value % MillisecondsPerDay); } } + +#if NET6_0_OR_GREATER + public class AppendDateOnly + { + [Theory] + [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date64ArrayTests))] + public void AppendDateGivesSameDate(DateOnly date) + { + // Arrange + var builder = new Date64Array.Builder(); + var expectedDateTime = date.ToDateTime(TimeOnly.MinValue); + var expectedDateTimeOffset = new DateTimeOffset(expectedDateTime, TimeSpan.Zero); + long expectedValue = (date.DayNumber - new DateOnly(1970, 1, 1).DayNumber) * MillisecondsPerDay; + + // Act + builder = builder.Append(date); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(date, array.GetDateOnly(0)); + Assert.Equal(expectedDateTime, array.GetDateTime(0)); + Assert.Equal(expectedDateTimeOffset, array.GetDateTimeOffset(0)); + Assert.Equal(expectedValue, 
array.GetValue(0)); + Assert.Equal(0, array.GetValue(0).Value % MillisecondsPerDay); + } + } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs b/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs index 1f2eae45b039c..c258fdd2d6988 100644 --- a/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestDateAndTimeData.cs @@ -59,6 +59,11 @@ from date in _exampleDates from kind in _exampleKinds select DateTime.SpecifyKind(date, kind); + /// + /// Gets a collection of example times + /// + public static IEnumerable ExampleTimes => _exampleTimes; + /// /// Gets a collection of example date/times, of all different kinds. /// diff --git a/csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs b/csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs new file mode 100644 index 0000000000000..cd66530a0e935 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/TimeOnlyTests.cs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Linq; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class TimeOnlyTests + { + private static IEnumerable GetTimeOnlyData(params TimeUnit[] units) => + from time in TestDateAndTimeData.ExampleTimes + from unit in units + select new object[] { TimeOnly.FromTimeSpan(time), unit }; + + public class Time32 + { + public static IEnumerable GetTestData => GetTimeOnlyData(TimeUnit.Second, TimeUnit.Millisecond); + + [Fact] + public void AppendThenGetGivesNull() + { + // Arrange + var builder = new Time32Array.Builder(); + + // Act + builder = builder.AppendNull(); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Null(array.GetTime(0)); + Assert.Null(array.GetValue(0)); + } + + [Theory] + [MemberData(nameof(GetTestData))] + public void AppendTimeGivesSameTime(TimeOnly time, TimeUnit timeUnit) + { + // Arrange + var builder = new Time32Array.Builder(timeUnit); + var expectedTime = time; + int expectedMilliseconds = (int)(time.Ticks / TimeSpan.TicksPerMillisecond); + + // Act + builder = builder.Append(time); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(expectedTime, array.GetTime(0)); + Assert.Equal(expectedMilliseconds, array.GetMilliSeconds(0)); + } + } + + public class Time64 + { + public static IEnumerable GetTestData => GetTimeOnlyData(TimeUnit.Microsecond, TimeUnit.Nanosecond); + + [Fact] + public void AppendThenGetGivesNull() + { + // Arrange + var builder = new Time64Array.Builder(); + + // Act + builder = builder.AppendNull(); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Null(array.GetTime(0)); + Assert.Null(array.GetValue(0)); + } + + [Theory] + 
[MemberData(nameof(GetTestData))] + public void AppendTimeGivesSameTime(TimeOnly time, TimeUnit timeUnit) + { + // Arrange + var builder = new Time64Array.Builder(timeUnit); + var expectedTime = time; + long expectedNanoseconds = time.Ticks * TimeSpan.NanosecondsPerTick; + + // Act + builder = builder.Append(time); + + // Assert + var array = builder.Build(); + Assert.Equal(1, array.Length); + Assert.Equal(expectedTime, array.GetTime(0)); + Assert.Equal(expectedNanoseconds, array.GetNanoSeconds(0)); + } + } + } +} From 3f0b62038d7ddac090db316722f5a1695e85701b Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Wed, 19 Jul 2023 23:40:47 +0300 Subject: [PATCH 015/749] GH-36433: [C++] Update fast_float version to 3.10.1 (#36434) ### Rationale for this change Need this for parsing Infinity values with a leading '+' sign. ### What changes are included in this PR? Updated fast_float to version 3.10.1. This version was chosen because later versions start using C++20 features, which cause a lot of build errors. ### Are these changes tested? Yes, within the scope of fast_float's own tests. ### Are there any user-facing changes? No * Closes: #36433 Lead-authored-by: Ivan Chesnov Co-authored-by: Ivan Chesnov Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/util/value_parsing.cc | 2 + cpp/src/arrow/util/value_parsing_test.cc | 6 + cpp/src/arrow/vendored/fast_float/README.md | 4 +- .../arrow/vendored/fast_float/ascii_number.h | 15 +- cpp/src/arrow/vendored/fast_float/bigint.h | 144 +- .../fast_float/constexpr_feature_detect.h | 40 + .../vendored/fast_float/decimal_to_binary.h | 17 +- .../vendored/fast_float/digit_comparison.h | 37 +- .../arrow/vendored/fast_float/fast_float.h | 2 +- .../arrow/vendored/fast_float/fast_table.h | 1313 +++++++++-------- .../arrow/vendored/fast_float/float_common.h | 179 ++- .../arrow/vendored/fast_float/parse_number.h | 32 + cpp/src/arrow/vendored/fast_float/update.sh | 2 +- 13 files changed, 991 insertions(+), 802 deletions(-) create mode 100644 cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index 92495612a7df8..f6a24ac1467f8 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+#define FASTFLOAT_ALLOWS_LEADING_PLUS 1 + #include "arrow/util/value_parsing.h" #include diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 6f83b6dfa6592..30c5e6aae74ba 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -119,6 +119,9 @@ TEST(StringConversion, ToFloat) { AssertConversion("0", 0.0f); AssertConversion("-0.0", -0.0f); AssertConversion("-1e20", -1e20f); + AssertConversion("+Infinity", std::numeric_limits::infinity()); + AssertConversion("-Infinity", -std::numeric_limits::infinity()); + AssertConversion("Infinity", std::numeric_limits::infinity()); AssertConversionFails(""); AssertConversionFails("e"); @@ -135,6 +138,9 @@ TEST(StringConversion, ToDouble) { AssertConversion("0", 0); AssertConversion("-0.0", -0.0); AssertConversion("-1e100", -1e100); + AssertConversion("+Infinity", std::numeric_limits::infinity()); + AssertConversion("-Infinity", -std::numeric_limits::infinity()); + AssertConversion("Infinity", std::numeric_limits::infinity()); AssertConversionFails(""); AssertConversionFails("e"); diff --git a/cpp/src/arrow/vendored/fast_float/README.md b/cpp/src/arrow/vendored/fast_float/README.md index 6d44654f2a721..b07c280e0ad44 100644 --- a/cpp/src/arrow/vendored/fast_float/README.md +++ b/cpp/src/arrow/vendored/fast_float/README.md @@ -20,7 +20,7 @@ # fast_float The files in this directory are vendored from fast_float -git tag `v3.8.1`. +git tag `v3.10.1`. See https://github.com/fastfloat/fast_float @@ -31,7 +31,7 @@ See https://github.com/fastfloat/fast_float ## How to update You must replace `VERSION` in the command lines with suitable version -such as `3.8.1`. +such as `3.10.1`. ```bash cpp/src/arrow/vendored/fast_float/update.sh VERSION diff --git a/cpp/src/arrow/vendored/fast_float/ascii_number.h b/cpp/src/arrow/vendored/fast_float/ascii_number.h index 24ec813174a7a..6d825ccfb5a48 100644 --- a/cpp/src/arrow/vendored/fast_float/ascii_number.h +++ b/cpp/src/arrow/vendored/fast_float/ascii_number.h @@ -13,9 +13,11 @@ namespace fast_float { // Next function can be micro-optimized, but compilers are entirely // able to optimize it well. 
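A minimal usage sketch (not part of the patch) of the leading-plus behaviour the new tests above exercise, assuming fast_float's upstream include path; Arrow itself defines the macro at the top of `value_parsing.cc` before the header is pulled in, as shown above:

```
#define FASTFLOAT_ALLOWS_LEADING_PLUS 1  // must be defined before inclusion
#include "fast_float/fast_float.h"

#include <cassert>
#include <cmath>
#include <string>
#include <system_error>

int main() {
  const std::string s = "+Infinity";
  double value = 0.0;
  auto res = fast_float::from_chars(s.data(), s.data() + s.size(), value);
  assert(res.ec == std::errc());             // parse succeeded
  assert(std::isinf(value) && value > 0.0);  // positive infinity
  return 0;
}
```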
-fastfloat_really_inline bool is_integer(char c) noexcept { return c >= '0' && c <= '9'; } +fastfloat_really_inline constexpr bool is_integer(char c) noexcept { + return c >= '0' && c <= '9'; +} -fastfloat_really_inline uint64_t byteswap(uint64_t val) { +fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 | (val & 0x0000FF0000000000) >> 24 @@ -45,7 +47,8 @@ fastfloat_really_inline void write_u64(uint8_t *chars, uint64_t val) { } // credit @aqrit -fastfloat_really_inline uint32_t parse_eight_digits_unrolled(uint64_t val) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +uint32_t parse_eight_digits_unrolled(uint64_t val) { const uint64_t mask = 0x000000FF000000FF; const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) @@ -60,7 +63,7 @@ fastfloat_really_inline uint32_t parse_eight_digits_unrolled(const char *chars) } // credit @aqrit -fastfloat_really_inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept { +fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept { return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & 0x8080808080808080)); } @@ -94,7 +97,11 @@ parsed_number_string parse_number_string(const char *p, const char *pend, parse_ answer.valid = false; answer.too_many_digits = false; answer.negative = (*p == '-'); +#if FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == '-') || (*p == '+')) { +#else if (*p == '-') { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here +#endif ++p; if (p == pend) { return answer; diff --git a/cpp/src/arrow/vendored/fast_float/bigint.h b/cpp/src/arrow/vendored/fast_float/bigint.h index b733c7b64ba6a..bc083893ac4ca 100644 --- a/cpp/src/arrow/vendored/fast_float/bigint.h +++ b/cpp/src/arrow/vendored/fast_float/bigint.h @@ -51,27 +51,27 @@ struct stackvec { stackvec &operator=(stackvec &&other) = delete; // create stack vector from existing limb span. - stackvec(limb_span s) { + FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) { FASTFLOAT_ASSERT(try_extend(s)); } - limb& operator[](size_t index) noexcept { + FASTFLOAT_CONSTEXPR14 limb& operator[](size_t index) noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return data[index]; } - const limb& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const limb& operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return data[index]; } // index from the end of the container - const limb& rindex(size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const limb& rindex(size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); size_t rindex = length - index - 1; return data[rindex]; } // set the length, without bounds checking. 
- void set_len(size_t len) noexcept { + FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept { length = uint16_t(len); } constexpr size_t len() const noexcept { @@ -84,12 +84,12 @@ struct stackvec { return size; } // append item to vector, without bounds checking - void push_unchecked(limb value) noexcept { + FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept { data[length] = value; length++; } // append item to vector, returning if item was added - bool try_push(limb value) noexcept { + FASTFLOAT_CONSTEXPR14 bool try_push(limb value) noexcept { if (len() < capacity()) { push_unchecked(value); return true; @@ -98,13 +98,13 @@ struct stackvec { } } // add items to the vector, from a span, without bounds checking - void extend_unchecked(limb_span s) noexcept { + FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept { limb* ptr = data + length; - ::memcpy((void*)ptr, (const void*)s.ptr, sizeof(limb) * s.len()); + std::copy_n(s.ptr, s.len(), ptr); set_len(len() + s.len()); } // try to add items to the vector, returning if items were added - bool try_extend(limb_span s) noexcept { + FASTFLOAT_CONSTEXPR20 bool try_extend(limb_span s) noexcept { if (len() + s.len() <= capacity()) { extend_unchecked(s); return true; @@ -115,6 +115,7 @@ struct stackvec { // resize the vector, without bounds checking // if the new size is longer than the vector, assign value to each // appended item. + FASTFLOAT_CONSTEXPR20 void resize_unchecked(size_t new_len, limb value) noexcept { if (new_len > len()) { size_t count = new_len - len(); @@ -127,7 +128,7 @@ struct stackvec { } } // try to resize the vector, returning if the vector was resized. - bool try_resize(size_t new_len, limb value) noexcept { + FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept { if (new_len > capacity()) { return false; } else { @@ -138,7 +139,7 @@ struct stackvec { // check if any limbs are non-zero after the given index. // this needs to be done in reverse order, since the index // is relative to the most significant limbs. - bool nonzero(size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 bool nonzero(size_t index) const noexcept { while (index < len()) { if (rindex(index) != 0) { return true; @@ -148,27 +149,27 @@ struct stackvec { return false; } // normalize the big integer, so most-significant zero limbs are removed. 
- void normalize() noexcept { + FASTFLOAT_CONSTEXPR14 void normalize() noexcept { while (len() > 0 && rindex(0) == 0) { length--; } } }; -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t empty_hi64(bool& truncated) noexcept { truncated = false; return 0; } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept { truncated = false; int shl = leading_zeroes(r0); return r0 << shl; } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { int shl = leading_zeroes(r0); if (shl == 0) { @@ -181,19 +182,19 @@ uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { } } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept { return uint64_hi64(r0, truncated); } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept { uint64_t x0 = r0; uint64_t x1 = r1; return uint64_hi64((x0 << 32) | x1, truncated); } -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept { uint64_t x0 = r0; uint64_t x1 = r1; @@ -205,15 +206,16 @@ uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noe // we want an efficient operation. for msvc, where // we don't have built-in intrinsics, this is still // pretty fast. -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb scalar_add(limb x, limb y, bool& overflow) noexcept { limb z; - // gcc and clang #if defined(__has_builtin) #if __has_builtin(__builtin_add_overflow) - overflow = __builtin_add_overflow(x, y, &z); - return z; + if (!cpp20_and_in_constexpr()) { + overflow = __builtin_add_overflow(x, y, &z); + return z; + } #endif #endif @@ -224,7 +226,7 @@ limb scalar_add(limb x, limb y, bool& overflow) noexcept { } // multiply two small integers, getting both the high and low bits. -fastfloat_really_inline +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb scalar_mul(limb x, limb y, limb& carry) noexcept { #ifdef FASTFLOAT_64BIT_LIMB #if defined(__SIZEOF_INT128__) @@ -252,7 +254,8 @@ limb scalar_mul(limb x, limb y, limb& carry) noexcept { // add scalar value to bigint starting from offset. // used in grade school multiplication template -inline bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { +inline FASTFLOAT_CONSTEXPR20 +bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { size_t index = start; limb carry = y; bool overflow; @@ -269,13 +272,15 @@ inline bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { // add scalar value to bigint. template -fastfloat_really_inline bool small_add(stackvec& vec, limb y) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool small_add(stackvec& vec, limb y) noexcept { return small_add_from(vec, y, 0); } // multiply bigint by scalar value. template -inline bool small_mul(stackvec& vec, limb y) noexcept { +inline FASTFLOAT_CONSTEXPR20 +bool small_mul(stackvec& vec, limb y) noexcept { limb carry = 0; for (size_t index = 0; index < vec.len(); index++) { vec[index] = scalar_mul(vec[index], y, carry); @@ -289,6 +294,7 @@ inline bool small_mul(stackvec& vec, limb y) noexcept { // add bigint to bigint starting from index. 
// used in grade school multiplication template +FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { // the effective x buffer is from `xstart..x.len()`, so exit early // if we can't get that current range. @@ -319,12 +325,14 @@ bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { // add bigint to bigint. template -fastfloat_really_inline bool large_add_from(stackvec& x, limb_span y) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool large_add_from(stackvec& x, limb_span y) noexcept { return large_add_from(x, y, 0); } // grade-school multiplication algorithm template +FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec& x, limb_span y) noexcept { limb_span xs = limb_span(x.data, x.len()); stackvec z(xs); @@ -353,6 +361,7 @@ bool long_mul(stackvec& x, limb_span y) noexcept { // grade-school multiplication algorithm template +FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec& x, limb_span y) noexcept { if (y.len() == 1) { FASTFLOAT_TRY(small_mul(x, y[0])); @@ -362,21 +371,52 @@ bool large_mul(stackvec& x, limb_span y) noexcept { return true; } +template +struct pow5_tables { + static constexpr uint32_t large_step = 135; + static constexpr uint64_t small_power_of_5[] = { + 1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL, + 1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL, + 6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL, + 3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL, + 2384185791015625UL, 11920928955078125UL, 59604644775390625UL, + 298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL, + }; +#ifdef FASTFLOAT_64BIT_LIMB + constexpr static limb large_power_of_5[] = { + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; +#else + constexpr static limb large_power_of_5[] = { + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; +#endif +}; + +template +constexpr uint32_t pow5_tables::large_step; + +template +constexpr uint64_t pow5_tables::small_power_of_5[]; + +template +constexpr limb pow5_tables::large_power_of_5[]; + // big integer type. implements a small subset of big integer // arithmetic, using simple algorithms since asymptotically // faster algorithms are slower for a small number of limbs. // all operations assume the big-integer is normalized. -struct bigint { +struct bigint : pow5_tables<> { // storage of the limbs, in little-endian order. stackvec vec; - bigint(): vec() {} + FASTFLOAT_CONSTEXPR20 bigint(): vec() {} bigint(const bigint &) = delete; bigint &operator=(const bigint &) = delete; bigint(bigint &&) = delete; bigint &operator=(bigint &&other) = delete; - bigint(uint64_t value): vec() { + FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() { #ifdef FASTFLOAT_64BIT_LIMB vec.push_unchecked(value); #else @@ -388,7 +428,7 @@ struct bigint { // get the high 64 bits from the vector, and if bits were truncated. // this is to get the significant digits for the float. - uint64_t hi64(bool& truncated) const noexcept { + FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept { #ifdef FASTFLOAT_64BIT_LIMB if (vec.len() == 0) { return empty_hi64(truncated); @@ -420,7 +460,7 @@ struct bigint { // positive, this is larger, otherwise they are equal. // the limbs are stored in little-endian order, so we // must compare the limbs in ever order. 
-  int compare(const bigint& other) const noexcept {
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept {
     if (vec.len() > other.vec.len()) {
       return 1;
     } else if (vec.len() < other.vec.len()) {
       return -1;
     } else {
@@ -441,7 +481,7 @@
   }

   // shift left each limb n bits, carrying over to the new limb
   // returns true if we were able to shift all the digits.
-  bool shl_bits(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept {
     // Internally, for each item, we shift left by n, and add the previous
     // right shifted limb-bits.
     // For example, we transform (for u8) shifted left 2, to:
@@ -467,7 +507,7 @@
   }

   // move the limbs left by `n` limbs.
-  bool shl_limbs(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept {
     FASTFLOAT_DEBUG_ASSERT(n != 0);
     if (n + vec.len() > vec.capacity()) {
       return false;
@@ -488,7 +528,7 @@
   }

   // move the limbs left by `n` bits.
-  bool shl(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept {
     size_t rem = n % limb_bits;
     size_t div = n / limb_bits;
     if (rem != 0) {
@@ -501,7 +541,7 @@
   }

   // get the number of leading zeros in the bigint.
-  int ctlz() const noexcept {
+  FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept {
     if (vec.is_empty()) {
       return 0;
     } else {
@@ -516,45 +556,27 @@
   }

   // get the number of bits in the bigint.
-  int bit_length() const noexcept {
+  FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept {
     int lz = ctlz();
     return int(limb_bits * vec.len()) - lz;
   }

-  bool mul(limb y) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept {
     return small_mul(vec, y);
   }

-  bool add(limb y) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept {
     return small_add(vec, y);
   }

   // multiply as if by 2 raised to a power.
-  bool pow2(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept {
     return shl(exp);
   }

   // multiply as if by 5 raised to a power.
-  bool pow5(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
     // multiply by a power of 5
-    static constexpr uint32_t large_step = 135;
-    static constexpr uint64_t small_power_of_5[] = {
-      1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
-      1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
-      6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
-      3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
-      2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
-      298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
-    };
-#ifdef FASTFLOAT_64BIT_LIMB
-    constexpr static limb large_power_of_5[] = {
-      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
-      10482974169319127550UL, 198276706040285095UL};
-#else
-    constexpr static limb large_power_of_5[] = {
-      4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
-      1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
-#endif
     size_t large_length = sizeof(large_power_of_5) / sizeof(limb);
     limb_span large = limb_span(large_power_of_5, large_length);
     while (exp >= large_step) {
@@ -580,7 +602,7 @@
   }

   // multiply as if by 10 raised to a power.
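   // (illustrative example: multiplying by 10^3 == 1000 is pow5(3), i.e. *125,
   // followed by pow2(3), i.e. << 3, since 10^n == 5^n * 2^n)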
-  bool pow10(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept {
     FASTFLOAT_TRY(pow5(exp));
     return pow2(exp);
   }
diff --git a/cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h b/cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h
new file mode 100644
index 0000000000000..ba8b65c64a160
--- /dev/null
+++ b/cpp/src/arrow/vendored/fast_float/constexpr_feature_detect.h
@@ -0,0 +1,40 @@
+#ifndef FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
+#define FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
+
+#ifdef __has_include
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+
+// Testing for https://wg21.link/N3652, adopted in C++14
+#if __cpp_constexpr >= 201304
+#define FASTFLOAT_CONSTEXPR14 constexpr
+#else
+#define FASTFLOAT_CONSTEXPR14
+#endif
+
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+#define FASTFLOAT_HAS_BIT_CAST 1
+#else
+#define FASTFLOAT_HAS_BIT_CAST 0
+#endif
+
+#if defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1
+#else
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0
+#endif
+
+// Testing for relevant C++20 constexpr library features
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \
+  && FASTFLOAT_HAS_BIT_CAST \
+  && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
+#define FASTFLOAT_CONSTEXPR20 constexpr
+#define FASTFLOAT_IS_CONSTEXPR 1
+#else
+#define FASTFLOAT_CONSTEXPR20
+#define FASTFLOAT_IS_CONSTEXPR 0
+#endif
+
+#endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
diff --git a/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h b/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h
index 8ae481d323865..9390228c3946a 100644
--- a/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h
+++ b/cpp/src/arrow/vendored/fast_float/decimal_to_binary.h
@@ -64,7 +64,7 @@ namespace detail {
 // create an adjusted mantissa, biased by the invalid power2
 // for significant digits already multiplied by 10 ** q.
 template <typename binary>
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
 adjusted_mantissa compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
   int hilz = int(w >> 63) ^ 1;
   adjusted_mantissa answer;
@@ -118,16 +118,11 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept {
   // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift)

   value128 product = compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
-  if(product.low == 0xFFFFFFFFFFFFFFFF) { // could guard it further
-    // In some very rare cases, this could happen, in which case we might need a more accurate
-    // computation than what we can provide cheaply. This is very, very unlikely.
-    //
-    const bool inside_safe_exponent = (q >= -27) && (q <= 55); // always good because 5**q <2**128 when q>=0,
-    // and otherwise, for q<0, we have 5**-q<2**64 and the 128-bit reciprocal allows for exact computation.
-    if(!inside_safe_exponent) {
-      return compute_error_scaled<binary>(q, product.high, lz);
-    }
-  }
+  // The computed 'product' is always sufficient.
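+  // (note: this means the rare-case fallback to compute_error_scaled that
+  // was removed above is no longer needed)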
+  // Mathematical proof:
+  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to appear)
+  // See script/mushtak_lemire.py
+
   // The "compute_product_approximation" function can be slightly slower than a branchless approach:
   // value128 product = compute_product(q, w);
   // but in practice, we can win big with the compute_product_approximation if its additional branch
diff --git a/cpp/src/arrow/vendored/fast_float/digit_comparison.h b/cpp/src/arrow/vendored/fast_float/digit_comparison.h
index 5cb01a93648fd..b27348a1fcc7e 100644
--- a/cpp/src/arrow/vendored/fast_float/digit_comparison.h
+++ b/cpp/src/arrow/vendored/fast_float/digit_comparison.h
@@ -24,7 +24,8 @@ constexpr static uint64_t powers_of_ten_uint64[] = {
 // this algorithm is not even close to optimized, but it has no practical
 // effect on performance: in order to have a faster algorithm, we'd need
 // to slow down performance for faster algorithms, and this is still fast.
-fastfloat_really_inline int32_t scientific_exponent(parsed_number_string& num) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+int32_t scientific_exponent(parsed_number_string& num) noexcept {
   uint64_t mantissa = num.mantissa;
   int32_t exponent = int32_t(num.exponent);
   while (mantissa >= 10000) {
@@ -82,7 +83,8 @@ fastfloat_really_inline adjusted_mantissa to_extended_halfway(T value) noexcept

 // round an extended-precision float to the nearest machine float.
 template <typename T, typename callback>
-fastfloat_really_inline void round(adjusted_mantissa& am, callback cb) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+void round(adjusted_mantissa& am, callback cb) noexcept {
   int32_t mantissa_shift = 64 - binary_format<T>::mantissa_explicit_bits() - 1;
   if (-am.power2 >= mantissa_shift) {
     // have a denormal float
@@ -111,23 +113,19 @@ fastfloat_really_inline void round(adjusted_mantissa& am, callback cb) noexcept
 }

 template <typename callback>
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
 void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept {
-  uint64_t mask;
-  uint64_t halfway;
-  if (shift == 64) {
-    mask = UINT64_MAX;
-  } else {
-    mask = (uint64_t(1) << shift) - 1;
-  }
-  if (shift == 0) {
-    halfway = 0;
-  } else {
-    halfway = uint64_t(1) << (shift - 1);
-  }
+  const uint64_t mask
+    = (shift == 64)
+      ? UINT64_MAX
+      : (uint64_t(1) << shift) - 1;
+  const uint64_t halfway
+    = (shift == 0)
+      ? 0
+      : uint64_t(1) << (shift - 1);

   uint64_t truncated_bits = am.mantissa & mask;
-  uint64_t is_above = truncated_bits > halfway;
-  uint64_t is_halfway = truncated_bits == halfway;
+  bool is_above = truncated_bits > halfway;
+  bool is_halfway = truncated_bits == halfway;

   // shift digits into position
   if (shift == 64) {
@@ -141,7 +139,8 @@ void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) n
   am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above));
 }

-fastfloat_really_inline void round_down(adjusted_mantissa& am, int32_t shift) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+void round_down(adjusted_mantissa& am, int32_t shift) noexcept {
   if (shift == 64) {
     am.mantissa = 0;
   } else {
@@ -200,7 +199,7 @@ void parse_eight_digits(const char*& p, limb& value, size_t& counter, size_t& co
   count += 8;
 }

-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
 void parse_one_digit(const char*& p, limb& value, size_t& counter, size_t& count) noexcept {
   value = value * 10 + limb(*p - '0');
   p++;
diff --git a/cpp/src/arrow/vendored/fast_float/fast_float.h b/cpp/src/arrow/vendored/fast_float/fast_float.h
index b379efdd9e409..7942fe35ad1d3 100644
--- a/cpp/src/arrow/vendored/fast_float/fast_float.h
+++ b/cpp/src/arrow/vendored/fast_float/fast_float.h
@@ -59,7 +59,7 @@ template <typename T>
 from_chars_result from_chars_advanced(const char *first, const char *last,
                                       T &value, parse_options options) noexcept;

-}
+} // namespace fast_float
 } // namespace arrow_vendored
 #include "parse_number.h"
 #endif // FASTFLOAT_FAST_FLOAT_H
diff --git a/cpp/src/arrow/vendored/fast_float/fast_table.h b/cpp/src/arrow/vendored/fast_float/fast_table.h
index 4861cab3a0d84..2c637d85c8fb0 100644
--- a/cpp/src/arrow/vendored/fast_float/fast_table.h
+++ b/cpp/src/arrow/vendored/fast_float/fast_table.h
@@ -18,11 +18,11 @@ namespace fast_float {
  */

 /**
- * The smallest non-zero float (binary64) is 2^−1074.
+ * The smallest non-zero float (binary64) is 2^-1074.
  * We take as input numbers of the form w x 10^q where w < 2^64.
  * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076.
  * However, we have that
- * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^−1074.
+ * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074.
  * Thus it is possible for a number of the form w * 10^-342 where
  * w is a 64-bit value to be a non-zero floating-point number.
  *********
@@ -37,665 +37,666 @@ constexpr static int smallest_power_of_five = binary_format::smallest_po
 constexpr static int largest_power_of_five = binary_format<double>::largest_power_of_ten();
 constexpr static int number_of_entries = 2 * (largest_power_of_five - smallest_power_of_five + 1);
 // Powers of five from 5^-342 all the way to 5^308 rounded toward one.
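 // (layout note: each power is stored as two consecutive uint64_t entries,
 // the high word followed by the low word of its 128-bit truncated
 // significand, which is why number_of_entries is twice the power count)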
-static const uint64_t power_of_five_128[number_of_entries]; +constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a,0x113faa2906a13b3f, + 0x9558b4661b6565f8,0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, + 0xe95a99df8ace6f53,0xf4d82c2c107973dc, + 0x91d8a02bb6c10594,0x79071b9b8a4be869, + 0xb64ec836a47146f9,0x9748e2826cdee284, + 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f,0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723,0xad2c788035e61382, + 0x8b16fb203055ac76,0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78,0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b,0x8672648c40e5ad68, + 0xa9c98d8ccb009506,0x680efdaf511f18c2, + 0xd43bf0effdc0ba48,0x212bd1b2566def2, + 0x84a57695fe98746d,0x14bb630f7604b57, + 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, + 0xcf42894a5dce35ea,0x52064cac828675b9, + 0x818995ce7aa0e1b2,0x7343efebd1940993, + 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6,0xd41a26e077774ef6, + 0xfd00b897478238d0,0x8920b098955522b4, + 0x9e20735e8cb16382,0x55b46e5f5d5535b0, + 0xc5a890362fddbc62,0xeb2189f734aa831d, + 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d,0x47b233c92125366e, + 0xc1069cd4eabe89f8,0x999ec0bb696e840a, + 0xf148440a256e2c76,0xc00670ea43ca250d, + 0x96cd2a865764dbca,0x380406926a5e5728, + 0xbc807527ed3e12bc,0xc605083704f5ecf2, + 0xeba09271e88d976b,0xf7864a44c633682e, + 0x93445b8731587ea3,0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c,0x5960ea05bad82964, + 0xe61acf033d1a45df,0x6fb92487298e33bd, + 0x8fd0c16206306bab,0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696,0x8f48a4899877186c, + 0xe0b62e2929aba83c,0x331acdabfe94de87, + 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, + 0x892731ac9faf056e,0xbe311c083a225cd2, + 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, + 0xd64d3d9db981787d,0x92cbbccdad5b108, + 0x85f0468293f0eb4e,0x25bbf56008c58ea5, + 0xa76c582338ed2621,0xaf2af2b80af6f24e, + 0xd1476e2c07286faa,0x1af5af660db4aee1, + 0x82cca4db847945ca,0x50d98d9fc890ed4d, + 0xa37fce126597973c,0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1,0x77b191618c54e9ac, + 0xc795830d75038c1d,0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, + 0x9becce62836ac577,0x4ee367f9430aec32, + 0xc2e801fb244576d5,0x229c41f793cda73f, + 0xf3a20279ed56d48a,0x6b43527578c1110f, + 0x9845418c345644d6,0x830a13896b78aaa9, + 0xbe5691ef416bd60c,0x23cc986bc656d553, + 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, + 0x91376c36d99995be,0x23100809b9c21fa1, + 0xb58547448ffffb2d,0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9,0x16c90c8f323f516c, + 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, + 0xb1442798f49ffb4a,0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d,0x40405643d711d583, + 0x8a7d3eef7f1cfc52,0x482835ea666b2572, + 0xad1c8eab5ee43b66,0xda3243650005eecf, + 0xd863b256369d4a40,0x90bed43e40076a82, + 0x873e4f75e2224e68,0x5a7744a6e804a291, + 0xa90de3535aaae202,0x711515d0a205cb36, + 0xd3515c2831559a83,0xd5a5b44ca873e03, + 0x8412d9991ed58091,0xe858790afe9486c2, + 0xa5178fff668ae0b6,0x626e974dbe39a872, + 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, + 0xa139029f6a239f72,0x1c1fffc1ebc44e80, + 0xc987434744ac874e,0xa327ffb266b56220, + 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, + 
0xc4ce17b399107c22,0xcb550fb4384d21d3, + 0xf6019da07f549b2b,0x7e2a53a146606a48, + 0x99c102844f94e0fb,0x2eda7444cbfc426d, + 0xc0314325637a1939,0xfa911155fefb5308, + 0xf03d93eebc589f88,0x793555ab7eba27ca, + 0x96267c7535b763b5,0x4bc1558b2f3458de, + 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb,0x465e15a979c1cadc, + 0x92a1958a7675175f,0xbfacd89ec191ec9, + 0xb749faed14125d36,0xcef980ec671f667b, + 0xe51c79a85916f484,0x82b7e12780e7401a, + 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9,0x67a791e093e1d49a, + 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d,0x58fae9f773886e18, + 0xda7f5bf590966848,0xaf39a475506a899e, + 0x888f99797a5e012d,0x6d8406c952429603, + 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26,0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, + 0xd0601d8efc57b08b,0xf13b94daf124da26, + 0x823c12795db6ce57,0x76c53d08d6b70858, + 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02,0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a,0x359ab6419ca1091b, + 0xf867241c8cc6d4c0,0xc30163d203c94b62, + 0x9b407691d7fc44f8,0x79e0de63425dcf1d, + 0xc21094364dfb5636,0x985915fc12f542e4, + 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, + 0xbd8430bd08277231,0x50c6ff782a838353, + 0xece53cec4a314ebd,0xa4f8bf5635246428, + 0x940f4613ae5ed136,0x871b7795e136be99, + 0xb913179899f68584,0x28e2557b59846e3f, + 0xe757dd7ec07426e5,0x331aeada2fe589cf, + 0x9096ea6f3848984f,0x3ff0d2c85def7621, + 0xb4bca50b065abe63,0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, + 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, + 0xb080392cc4349dec,0xbd8d794d96aacfb3, + 0xdca04777f541c567,0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60,0xf41686c49db57244, + 0xac5d37d5b79b6239,0x311c2875c522ced5, + 0xd77485cb25823ac7,0x7d633293366b828b, + 0x86a8d39ef77164bc,0xae5dff9c02033197, + 0xa8530886b54dbdeb,0xd9f57f830283fdfc, + 0xd267caa862a12d66,0xd072df63c324fd7b, + 0x8380dea93da4bc60,0x4247cb9e59f71e6d, + 0xa46116538d0deb78,0x52d9be85f074e608, + 0xcd795be870516656,0x67902e276c921f8b, + 0x806bd9714632dff6,0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c,0x796b805720085f81, + 0x9cc3a6eec6311a63,0xcbe3303674053bb0, + 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b,0xee92fb5515482d44, + 0x991711052d8bf3c5,0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6,0xd262d45a78a0635d, + 0xef340a98172aace4,0x86fb897116c87c34, + 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, + 0xbae0a846d2195712,0x8974836059cca109, + 0xe998d258869facd7,0x2bd1a438703fc94b, + 0x91ff83775423cc06,0x7b6306a34627ddcf, + 0xb67f6455292cbf08,0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, + 0x8e938662882af53e,0x547eb47b7282ee9c, + 0xb23867fb2a35b28d,0xe99e619a4f23aa43, + 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, + 0xae0b158b4738705e,0x9624ab50b148d445, + 0xd98ddaee19068c76,0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b,0x7647c3200069671f, + 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, + 0xa5fb0a17c777cf09,0xf468107100525890, + 0xcf79cc9db955c2cc,0x7182148d4066eeb4, + 0x81ac1fe293d599bf,0xc6f14cd848405530, + 0xa21727db38cb002f,0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, + 0xfd442e4688bd304a,0x908f4a166d1da663, + 
0x9e4a9cec15763e2e,0x9a598e4e043287fe, + 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, + 0xf7549530e188c128,0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, + 0xc13a148e3032d6e7,0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, + 0xebdf661791d60f56,0x111b495b3464ad21, + 0x936b9fcebb25c995,0xcab10dd900beec34, + 0xb84687c269ef3bfb,0x3d5d514f40eea742, + 0xe65829b3046b0afa,0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, + 0xb3f4e093db73a093,0x59ed216765690f56, + 0xe0f218b8d25088b8,0x306869c13ec3532c, + 0x8c974f7383725573,0x1e414218c73a13fb, + 0xafbd2350644eeacf,0xe5d1929ef90898fa, + 0xdbac6c247d62a583,0xdf45f746b74abf39, + 0x894bc396ce5da772,0x6b8bba8c328eb783, + 0xab9eb47c81f5114f,0x66ea92f3f326564, + 0xd686619ba27255a2,0xc80a537b0efefebd, + 0x8613fd0145877585,0xbd06742ce95f5f36, + 0xa798fc4196e952e7,0x2c48113823b73704, + 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, + 0x82ef85133de648c4,0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3,0x318df905079926a8, + 0xffbbcfe994e5c61f,0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d,0x6bea10ca65c084e, + 0xc31bfa0fe5698db8,0x486e494fcff30a62, + 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7,0xf89629465a75e01c, + 0xbe89523386091465,0xf6bbb397f1135823, + 0xee2ba6c0678b597f,0x746aa07ded582e2c, + 0x94db483840b717ef,0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb,0x92f34d62616ce413, + 0xe896a0d7e51e1566,0x77b020baf9c81d17, + 0x915e2486ef32cd60,0xace1474dc1d122e, + 0xb5b5ada8aaff80b8,0xd819992132456ba, + 0xe3231912d5bf60e6,0x10e1fff697ed6c69, + 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d,0x86c16c98d2c953c6, + 0xd89d64d57a607744,0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b,0x11471cd764ad4972, + 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, + 0xd389b47879823479,0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb,0xcedf722a585139ba, + 0xa54394fe1eedb8fe,0xc2974eb4ee658828, + 0xce947a3da6a9273e,0x733d226229feea32, + 0x811ccc668829b887,0x806357d5a3f525f, + 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052,0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67,0xbbac2078d443ace2, + 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, + 0xc5029163f384a931,0xa9e795e65d4df11, + 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e,0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, + 0xf07da27a82c37088,0x5d767327bb4e5a4c, + 0x964e858c91ba2655,0x3a6a07f8d510f86f, + 0xbbe226efb628afea,0x890489f70a55368b, + 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb,0x9ce6ebb40173744, + 0xe55990879ddcaabd,0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6,0x9fa946824a12232d, + 0xb32df8e9f3546564,0x47939822dc96abf9, + 0xdff9772470297ebd,0x59787e2b93bc56f7, + 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, + 0xaefae51477a06b03,0xede622920b6b23f1, + 0xdab99e59958885c4,0xe95fab368e45eced, + 0x88b402f7fd75539b,0x11dbcb0218ebb414, + 0xaae103b5fcd2a881,0xd652bdc29f26a119, + 0xd59944a37c0752a2,0x4be76d3346f0495f, + 0x857fcae62d8493a5,0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2,0x7e2000a41346a7a7, + 0x825ecc24c873782f,0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b,0x728900802f0f32fa, + 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, + 
0xfea126b7d78186bc,0xe2f610c84987bfa8, + 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143,0x91503d1c79720dbb, + 0xf8a95fcf88747d94,0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, + 0xc24452da229b021b,0xfbe85badce996168, + 0xf2d56790ab41c2a2,0xfae27299423fb9c3, + 0x97c560ba6b0919a5,0xdccd879fc967d41a, + 0xbdb6b8e905cb600f,0x5400e987bbc1c920, + 0xed246723473e3813,0x290123e9aab23b68, + 0x9436c0760c86e30b,0xf9a0b6720aaf6521, + 0xb94470938fa89bce,0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2,0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232,0x25c6da63c38de1b0, + 0x8d590723948a535f,0x579c487e5a38ad0e, + 0xb0af48ec79ace837,0x2d835a9df0c6d851, + 0xdcdb1b2798182244,0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5,0xe272467e3d222f3f, + 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea,0x98e947129fc2b4e9, + 0xa87fea27a539e9a5,0x3f2398d747b36224, + 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89,0x1953cf68300424ac, + 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, + 0xcdb02555653131b6,0x3792f412cb06794d, + 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b,0xf245825a5a445275, + 0xfb158592be068d2e,0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d,0x55464dd69685606b, + 0xc428d05aa4751e4c,0xaa97e14c3c26b886, + 0xf53304714d9265df,0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab,0xe546a8038efe4029, + 0xbf8fdb78849a5f96,0xde98520472bdd033, + 0xef73d256a5c0f77c,0x963e66858f6d4440, + 0x95a8637627989aad,0xdde7001379a44aa8, + 0xbb127c53b17ec159,0x5560c018580d5d52, + 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, + 0x9226712162ab070d,0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, + 0xb267ed1940f1c61c,0x55f038b237591ed3, + 0xdf01e85f912e37a3,0x6b6c46dec52f6688, + 0x8b61313bbabce2c6,0x2323ac4b3b3da015, + 0xae397d8aa96c1b77,0xabec975e0a0d081a, + 0xd9c7dced53c72255,0x96e7bd358c904a21, + 0x881cea14545c7575,0x7e50d64177da2e54, + 0xaa242499697392d2,0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, + 0x84ec3c97da624ab4,0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba,0x67de18eda5814af2, + 0x81ceb32c4b43fcf4,0x80eacf948770ced7, + 0xa2425ff75e14fc31,0xa1258379a94d028d, + 0xcad2f7f5359a3b3e,0x96ee45813a04330, + 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, + 0x9e74d1b791e07e48,0x775ea264cf55347e, + 0xc612062576589dda,0x95364afe032a819e, + 0xf79687aed3eec551,0x3a83ddbd83f52205, + 0x9abe14cd44753b52,0xc4926a9672793543, + 0xc16d9a0095928a27,0x75b7053c0f178294, + 0xf1c90080baf72cb1,0x5324c68b12dd6339, + 0x971da05074da7bee,0xd3f6fc16ebca5e04, + 0xbce5086492111aea,0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07,0x3aff322e62439fd0, + 0xb877aa3236a4b449,0x9befeb9fad487c3, + 0xe69594bec44de15b,0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9,0xf9d37014bf60a11, + 0xb424dc35095cd80f,0x538484c19ef38c95, + 0xe12e13424bb40e13,0x2865a5f206b06fba, + 0x8cbccc096f5088cb,0xf93f87b7442e45d4, + 0xafebff0bcb24aafe,0xf78f69a51539d749, + 0xdbe6fecebdedd5be,0xb573440e5a884d1c, + 0x89705f4136b4a597,0x31680a88f8953031, + 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc,0x3d32907604691b4d, + 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, + 0xa7c5ac471b478423,0xfcf80dc33721d54, + 0xd1b71758e219652b,0xd3c36113404ea4a9, + 0x83126e978d4fdf3b,0x645a1cac083126ea, + 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, + 
0xcccccccccccccccc,0xcccccccccccccccd, + 0x8000000000000000,0x0, + 0xa000000000000000,0x0, + 0xc800000000000000,0x0, + 0xfa00000000000000,0x0, + 0x9c40000000000000,0x0, + 0xc350000000000000,0x0, + 0xf424000000000000,0x0, + 0x9896800000000000,0x0, + 0xbebc200000000000,0x0, + 0xee6b280000000000,0x0, + 0x9502f90000000000,0x0, + 0xba43b74000000000,0x0, + 0xe8d4a51000000000,0x0, + 0x9184e72a00000000,0x0, + 0xb5e620f480000000,0x0, + 0xe35fa931a0000000,0x0, + 0x8e1bc9bf04000000,0x0, + 0xb1a2bc2ec5000000,0x0, + 0xde0b6b3a76400000,0x0, + 0x8ac7230489e80000,0x0, + 0xad78ebc5ac620000,0x0, + 0xd8d726b7177a8000,0x0, + 0x878678326eac9000,0x0, + 0xa968163f0a57b400,0x0, + 0xd3c21bcecceda100,0x0, + 0x84595161401484a0,0x0, + 0xa56fa5b99019a5c8,0x0, + 0xcecb8f27f4200f3a,0x0, + 0x813f3978f8940984,0x4000000000000000, + 0xa18f07d736b90be5,0x5000000000000000, + 0xc9f2c9cd04674ede,0xa400000000000000, + 0xfc6f7c4045812296,0x4d00000000000000, + 0x9dc5ada82b70b59d,0xf020000000000000, + 0xc5371912364ce305,0x6c28000000000000, + 0xf684df56c3e01bc6,0xc732000000000000, + 0x9a130b963a6c115c,0x3c7f400000000000, + 0xc097ce7bc90715b3,0x4b9f100000000000, + 0xf0bdc21abb48db20,0x1e86d40000000000, + 0x96769950b50d88f4,0x1314448000000000, + 0xbc143fa4e250eb31,0x17d955a000000000, + 0xeb194f8e1ae525fd,0x5dcfab0800000000, + 0x92efd1b8d0cf37be,0x5aa1cae500000000, + 0xb7abc627050305ad,0xf14a3d9e40000000, + 0xe596b7b0c643c719,0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f,0xe4820023a2000000, + 0xb35dbf821ae4f38b,0xdda2802c8a800000, + 0xe0352f62a19e306e,0xd50b2037ad200000, + 0x8c213d9da502de45,0x4526f422cc340000, + 0xaf298d050e4395d6,0x9670b12b7f410000, + 0xdaf3f04651d47b4c,0x3c0cdd765f114000, + 0x88d8762bf324cd0f,0xa5880a69fb6ac800, + 0xab0e93b6efee0053,0x8eea0d047a457a00, + 0xd5d238a4abe98068,0x72a4904598d6d880, + 0x85a36366eb71f041,0x47a6da2b7f864750, + 0xa70c3c40a64e6c51,0x999090b65f67d924, + 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, + 0x82818f1281ed449f,0xbff8f10e7a8921a4, + 0xa321f2d7226895c7,0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, + 0xfee50b7025c36a08,0x2f236d04753d5b4, + 0x9f4f2726179a2245,0x1d762422c946590, + 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, + 0x9b934c3b330c8577,0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a,0x8bef464e3945ef7a, + 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, + 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436,0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44,0x60dbbca87196b616, + 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, + 0xb51d13aea4a488dd,0x6babab6398bdbe41, + 0xe264589a4dcdab14,0xc696963c7eed2dd1, + 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8,0x3b25a55f43294bcb, + 0xdd15fe86affad912,0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab,0x6e3569326c784337, + 0xacb92ed9397bf996,0x49c2c37f07965404, + 0xd7e77a8f87daf7fb,0xdc33745ec97be906, + 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, + 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b,0xf50a3fa490c30190, + 0x83c7088e1aab65db,0x792667c6da79e0fa, + 0xa4b8cab1a1563f52,0x577001b891185938, + 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, + 0x80b05e5ac60b6178,0x544f8158315b05b4, + 0xa0dc75f1778e39d6,0x696361ae3db1c721, + 0xc913936dd571c84c,0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f,0x4ab48a04065c723, + 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, + 0xc45d1df942711d9a,0x3ba5d0bd324f8394, + 0xf5746577930d6500,0xca8f44ec7ee36479, + 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, + 
0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5,0xbba1f1d158724a12, + 0xbb445da9ca61281f,0x2a8a6e45ae8edc97, + 0xea1575143cf97226,0xf52d09d71a3293bd, + 0x924d692ca61be758,0x593c2626705f9c56, + 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, + 0xe498f455c38b997a,0xb6dfb9c0f956447, + 0x8edf98b59a373fec,0x4724bd4189bd5eac, + 0xb2977ee300c50fe7,0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, + 0x8b865b215899f46c,0xbd79e0d20082ee74, + 0xae67f1e9aec07187,0xecd8590680a3aa11, + 0xda01ee641a708de9,0xe80e6f4820cc9495, + 0x884134fe908658b2,0x3109058d147fdcdd, + 0xaa51823e34a7eede,0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, + 0x850fadc09923329e,0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45,0x84db8346b786151c, + 0xcfe87f7cef46ff16,0xe612641865679a63, + 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749,0xe3be5e330f38f09d, + 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, + 0xc646d63501a1511d,0xb281e1fd541501b8, + 0xf7d88bc24209a565,0x1f225a7ca91a4226, + 0x9ae757596946075f,0x3375788de9b06958, + 0xc1a12d2fc3978937,0x52d6b1641c83ae, + 0xf209787bb47d6b84,0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332,0xf840b7ba963646e0, + 0xbd176620a501fbff,0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf,0xc66f336c36b10137, + 0xb8a8d9bbe123f017,0xb80b0047445d4184, + 0xe6d3102ad96cec1d,0xa60dc059157491e5, + 0x9043ea1ac7e41392,0x87c89837ad68db2f, + 0xb454e4a179dd1877,0x29babe4598c311fb, + 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d,0x1899e4a65f58660c, + 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d,0x76707543f4fa1f73, + 0x899504ae72497eba,0x6a06494a791c53a8, + 0xabfa45da0edbde69,0x487db9d17636892, + 0xd6f8d7509292d603,0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, + 0xa7f26836f282b732,0x8e6cac7768d7141e, + 0xd1ef0244af2364ff,0x3207d795430cd926, + 0x8335616aed761f1f,0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, + 0xcd036837130890a1,0x36dba887c37a8c0f, + 0x802221226be55a64,0xc2494954da2c9789, + 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d,0x6f92829494e5acc7, + 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, + 0x9c69a97284b578d7,0xff2a760414536efb, + 0xc38413cf25e2d70d,0xfef5138519684aba, + 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, + 0x98bf2f79d5993802,0xef2f773ffbd97a61, + 0xbeeefb584aff8603,0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2,0xdd945a747bf26183, + 0xba756174393d88df,0x94f971119aeef9e4, + 0xe912b9d1478ceb17,0x7a37cd5601aab85d, + 0x91abb422ccb812ee,0xac62e055c10ab33a, + 0xb616a12b7fe617aa,0x577b986b314d6009, + 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d,0x14588f13be847307, + 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee,0x25de7bb9480d5854, + 0xada72ccc20054ae9,0xaf561aa79a10ae6a, + 0xd910f7ff28069da4,0x1b2ba1518094da04, + 0x87aa9aff79042286,0x90fb44d2f05d0842, + 0xa99541bf57452b28,0x353a1607ac744a53, + 0xd3fa922f2d1675f2,0x42889b8997915ce8, + 0x847c9b5d7c2e09b7,0x69956135febada11, + 0xa59bc234db398c25,0x43fab9837e699095, + 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, + 0x8161afb94b44f57d,0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc,0x6462d92a69731732, + 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78,0x5cda735244c3d43e, + 0x9defbf01b061adab,0x3a0888136afa64a7, + 0xc56baec21c7a1916,0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b,0x8aad549e57273d45, + 0x9a3c2087a63f6399,0x36ac54e2f678864b, + 
0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, + 0x969eb7c47859e743,0x9f644ae5a4b1b325, + 0xbc4665b596706114,0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8,0x9a7f12442d588f2, + 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, + 0x8fa475791a569d10,0xf96e017d694487bc, + 0xb38d92d760ec4455,0x37c981dcc395a9ac, + 0xe070f78d3927556a,0x85bbe253f47b1417, + 0x8c469ab843b89562,0x93956d7478ccec8e, + 0xaf58416654a6babb,0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, + 0x88fcf317f22241e2,0x441fece3bdf81f03, + 0xab3c2fddeeaad25a,0xd527e81cad7626c3, + 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, + 0x85c7056562757456,0xf6872d5667844e49, + 0xa738c6bebb12d16c,0xb428f8ac016561db, + 0xd106f86e69d785c7,0xe13336d701beba52, + 0x82a45b450226b39c,0xecc0024661173473, + 0xa34d721642b06084,0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, + 0xff290242c83396ce,0x7e67047175a15271, + 0x9f79a169bd203e41,0xf0062c6e984d386, + 0xc75809c42c684dd1,0x52c07b78a3e60868, + 0xf92e0c3537826145,0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb,0x88a66076400bb691, + 0xc2abf989935ddbfe,0x6acff893d00ea435, + 0xf356f7ebf83552fe,0x583f6b8c4124d43, + 0x98165af37b2153de,0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, + 0xeda2ee1c7064130c,0x1162def06f79df73, + 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, + 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0,0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, + 0x8da471a9de737e24,0x5ceaecfed289e5d2, + 0xb10d8e1456105dad,0x7425a83e872c5f47, + 0xdd50f1996b947518,0xd12f124e28f77719, + 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b,0x636cc64d1001550b, + 0xd8210befd30efa5a,0x3c47f7e05401aa4e, + 0x8714a775e3e95c78,0x65acfaec34810a71, + 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, + 0xd31045a8341ca07c,0x1ede48111209a050, + 0x83ea2b892091e44d,0x934aed0aab460432, + 0xa4e4b66b68b65d60,0xf81da84d5617853f, + 0xce1de40642e3f4b9,0x36251260ab9d668e, + 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, + 0xa1075a24e4421730,0xb24cf65b8612f81f, + 0xc94930ae1d529cfc,0xdee033f26797b627, + 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, + 0x9d412e0806e88aa5,0x8e1f289560ee864e, + 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2,0xae10af696774b1db, + 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f,0x17fd090a58d32af3, + 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, + 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513,0x84c86189216dc5ed, + 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515,0xfabaf3feaa5334a, + 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8,0x743e20e9ef511012, + 0xdf78e4b2bd342cf6,0x914da9246b255416, + 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, + 0xae9672aba3d0c320,0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, + 0x8865899617fb1871,0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, + 0xd51ea6fa85785631,0x552a74227f3ea565, + 0x8533285c936b35de,0xd53a88958f87275f, + 0xa67ff273b8460356,0x8a892abaf368f137, + 0xd01fef10a657842c,0x2d2b7569b0432d85, + 0x8213f56a67f6b29b,0x9c3b29620e29fc73, + 0xa298f2c501f45f42,0x8349f3ba91b47b8f, + 0xcb3f2f7642717713,0x241c70a936219a73, + 0xfe0efb53d30dd4d7,0xed238cd383aa0110, + 0x9ec95d1463e8a506,0xf4363804324a40aa, + 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da,0xdd94b7868e94050a, + 
0x9b10a4e5e9913128,0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf,0xbc633b39673c8cec, + 0x976e41088617ca01,0xd5be0503e085d813, + 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, + 0xec9c459d51852ba2,0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45,0xcabb90e5c942b503, + 0xb8da1662e7b00a17,0x3d6a751f3b936243, + 0xe7109bfba19c0c9d,0xcc512670a783ad4, + 0x906a617d450187e2,0x27fb2b80668b24c5, + 0xb484f9dc9641e9da,0xb1f9f660802dedf6, + 0xe1a63853bbd26451,0x5e7873f8a0396973, + 0x8d07e33455637eb2,0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7,0x7641a140cc7810fb, + 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, + 0xac2820d9623bf429,0x546345fa9fbdcd44, + 0xd732290fbacaf133,0xa97c177947ad4095, + 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, + 0xa81f301449ee8c70,0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c,0x73832eec6fff3111, + 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, + 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, + 0xa0555e361951c366,0xd7e105bcc332621f, + 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, + 0xfa856334878fc150,0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07,0xa862f80ec4700c8, + 0xf4a642e14c6262c8,0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, + 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, + 0xeeea5d5004981478,0x1858ccfce06cac74, + 0x95527a5202df0ccb,0xf37801e0c43ebc8, + 0xbaa718e68396cffd,0xd30560258f54e6ba, + 0xe950df20247c83fd,0x47c6b82ef32a2069, + 0x91d28b7416cdd27e,0x4cdc331d57fa5441, + 0xb6472e511c81471d,0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5,0x58180fddd97723a6, + 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; }; template -const uint64_t powers_template::power_of_five_128[number_of_entries] = { - 0xeef453d6923bd65a,0x113faa2906a13b3f, - 0x9558b4661b6565f8,0x4ac7ca59a424c507, - 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, - 0xe95a99df8ace6f53,0xf4d82c2c107973dc, - 0x91d8a02bb6c10594,0x79071b9b8a4be869, - 0xb64ec836a47146f9,0x9748e2826cdee284, - 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, - 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, - 0xb208ef855c969f4f,0xbdbd2d335e51a935, - 0xde8b2b66b3bc4723,0xad2c788035e61382, - 0x8b16fb203055ac76,0x4c3bcb5021afcc31, - 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, - 0xd953e8624b85dd78,0xd71d6dad34a2af0d, - 0x87d4713d6f33aa6b,0x8672648c40e5ad68, - 0xa9c98d8ccb009506,0x680efdaf511f18c2, - 0xd43bf0effdc0ba48,0x212bd1b2566def2, - 0x84a57695fe98746d,0x14bb630f7604b57, - 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, - 0xcf42894a5dce35ea,0x52064cac828675b9, - 0x818995ce7aa0e1b2,0x7343efebd1940993, - 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, - 0xca66fa129f9b60a6,0xd41a26e077774ef6, - 0xfd00b897478238d0,0x8920b098955522b4, - 0x9e20735e8cb16382,0x55b46e5f5d5535b0, - 0xc5a890362fddbc62,0xeb2189f734aa831d, - 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, - 0x9a6bb0aa55653b2d,0x47b233c92125366e, - 0xc1069cd4eabe89f8,0x999ec0bb696e840a, - 0xf148440a256e2c76,0xc00670ea43ca250d, - 0x96cd2a865764dbca,0x380406926a5e5728, - 0xbc807527ed3e12bc,0xc605083704f5ecf2, - 0xeba09271e88d976b,0xf7864a44c633682e, - 0x93445b8731587ea3,0x7ab3ee6afbe0211d, - 0xb8157268fdae9e4c,0x5960ea05bad82964, - 0xe61acf033d1a45df,0x6fb92487298e33bd, - 0x8fd0c16206306bab,0xa5d3b6d479f8e056, - 0xb3c4f1ba87bc8696,0x8f48a4899877186c, - 0xe0b62e2929aba83c,0x331acdabfe94de87, - 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, - 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, - 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, - 0x892731ac9faf056e,0xbe311c083a225cd2, - 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, - 
0xd64d3d9db981787d,0x92cbbccdad5b108, - 0x85f0468293f0eb4e,0x25bbf56008c58ea5, - 0xa76c582338ed2621,0xaf2af2b80af6f24e, - 0xd1476e2c07286faa,0x1af5af660db4aee1, - 0x82cca4db847945ca,0x50d98d9fc890ed4d, - 0xa37fce126597973c,0xe50ff107bab528a0, - 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, - 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, - 0x9faacf3df73609b1,0x77b191618c54e9ac, - 0xc795830d75038c1d,0xd59df5b9ef6a2417, - 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, - 0x9becce62836ac577,0x4ee367f9430aec32, - 0xc2e801fb244576d5,0x229c41f793cda73f, - 0xf3a20279ed56d48a,0x6b43527578c1110f, - 0x9845418c345644d6,0x830a13896b78aaa9, - 0xbe5691ef416bd60c,0x23cc986bc656d553, - 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, - 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, - 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, - 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, - 0x91376c36d99995be,0x23100809b9c21fa1, - 0xb58547448ffffb2d,0xabd40a0c2832a78a, - 0xe2e69915b3fff9f9,0x16c90c8f323f516c, - 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, - 0xb1442798f49ffb4a,0x99cd11cfdf41779c, - 0xdd95317f31c7fa1d,0x40405643d711d583, - 0x8a7d3eef7f1cfc52,0x482835ea666b2572, - 0xad1c8eab5ee43b66,0xda3243650005eecf, - 0xd863b256369d4a40,0x90bed43e40076a82, - 0x873e4f75e2224e68,0x5a7744a6e804a291, - 0xa90de3535aaae202,0x711515d0a205cb36, - 0xd3515c2831559a83,0xd5a5b44ca873e03, - 0x8412d9991ed58091,0xe858790afe9486c2, - 0xa5178fff668ae0b6,0x626e974dbe39a872, - 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, - 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, - 0xa139029f6a239f72,0x1c1fffc1ebc44e80, - 0xc987434744ac874e,0xa327ffb266b56220, - 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, - 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, - 0xc4ce17b399107c22,0xcb550fb4384d21d3, - 0xf6019da07f549b2b,0x7e2a53a146606a48, - 0x99c102844f94e0fb,0x2eda7444cbfc426d, - 0xc0314325637a1939,0xfa911155fefb5308, - 0xf03d93eebc589f88,0x793555ab7eba27ca, - 0x96267c7535b763b5,0x4bc1558b2f3458de, - 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, - 0xea9c227723ee8bcb,0x465e15a979c1cadc, - 0x92a1958a7675175f,0xbfacd89ec191ec9, - 0xb749faed14125d36,0xcef980ec671f667b, - 0xe51c79a85916f484,0x82b7e12780e7401a, - 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, - 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, - 0xdfbdcece67006ac9,0x67a791e093e1d49a, - 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, - 0xaecc49914078536d,0x58fae9f773886e18, - 0xda7f5bf590966848,0xaf39a475506a899e, - 0x888f99797a5e012d,0x6d8406c952429603, - 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, - 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, - 0x855c3be0a17fcd26,0x5cf2eea09a55067f, - 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, - 0xd0601d8efc57b08b,0xf13b94daf124da26, - 0x823c12795db6ce57,0x76c53d08d6b70858, - 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, - 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, - 0xfe5d54150b090b02,0xd3f93b35435d7c4c, - 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, - 0xc6b8e9b0709f109a,0x359ab6419ca1091b, - 0xf867241c8cc6d4c0,0xc30163d203c94b62, - 0x9b407691d7fc44f8,0x79e0de63425dcf1d, - 0xc21094364dfb5636,0x985915fc12f542e4, - 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, - 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, - 0xbd8430bd08277231,0x50c6ff782a838353, - 0xece53cec4a314ebd,0xa4f8bf5635246428, - 0x940f4613ae5ed136,0x871b7795e136be99, - 0xb913179899f68584,0x28e2557b59846e3f, - 0xe757dd7ec07426e5,0x331aeada2fe589cf, - 0x9096ea6f3848984f,0x3ff0d2c85def7621, - 0xb4bca50b065abe63,0xfed077a756b53a9, - 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, - 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, - 0xb080392cc4349dec,0xbd8d794d96aacfb3, - 0xdca04777f541c567,0xecf0d7a0fc5583a0, - 0x89e42caaf9491b60,0xf41686c49db57244, - 
0xac5d37d5b79b6239,0x311c2875c522ced5, - 0xd77485cb25823ac7,0x7d633293366b828b, - 0x86a8d39ef77164bc,0xae5dff9c02033197, - 0xa8530886b54dbdeb,0xd9f57f830283fdfc, - 0xd267caa862a12d66,0xd072df63c324fd7b, - 0x8380dea93da4bc60,0x4247cb9e59f71e6d, - 0xa46116538d0deb78,0x52d9be85f074e608, - 0xcd795be870516656,0x67902e276c921f8b, - 0x806bd9714632dff6,0xba1cd8a3db53b6, - 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, - 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, - 0xfad2a4b13d1b5d6c,0x796b805720085f81, - 0x9cc3a6eec6311a63,0xcbe3303674053bb0, - 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, - 0xf4f1b4d515acb93b,0xee92fb5515482d44, - 0x991711052d8bf3c5,0x751bdd152d4d1c4a, - 0xbf5cd54678eef0b6,0xd262d45a78a0635d, - 0xef340a98172aace4,0x86fb897116c87c34, - 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, - 0xbae0a846d2195712,0x8974836059cca109, - 0xe998d258869facd7,0x2bd1a438703fc94b, - 0x91ff83775423cc06,0x7b6306a34627ddcf, - 0xb67f6455292cbf08,0x1a3bc84c17b1d542, - 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, - 0x8e938662882af53e,0x547eb47b7282ee9c, - 0xb23867fb2a35b28d,0xe99e619a4f23aa43, - 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, - 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, - 0xae0b158b4738705e,0x9624ab50b148d445, - 0xd98ddaee19068c76,0x3badd624dd9b0957, - 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, - 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, - 0xd47487cc8470652b,0x7647c3200069671f, - 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, - 0xa5fb0a17c777cf09,0xf468107100525890, - 0xcf79cc9db955c2cc,0x7182148d4066eeb4, - 0x81ac1fe293d599bf,0xc6f14cd848405530, - 0xa21727db38cb002f,0xb8ada00e5a506a7c, - 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, - 0xfd442e4688bd304a,0x908f4a166d1da663, - 0x9e4a9cec15763e2e,0x9a598e4e043287fe, - 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, - 0xf7549530e188c128,0xd12bee59e68ef47c, - 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, - 0xc13a148e3032d6e7,0xe36a52363c1faf01, - 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, - 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, - 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, - 0xebdf661791d60f56,0x111b495b3464ad21, - 0x936b9fcebb25c995,0xcab10dd900beec34, - 0xb84687c269ef3bfb,0x3d5d514f40eea742, - 0xe65829b3046b0afa,0xcb4a5a3112a5112, - 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, - 0xb3f4e093db73a093,0x59ed216765690f56, - 0xe0f218b8d25088b8,0x306869c13ec3532c, - 0x8c974f7383725573,0x1e414218c73a13fb, - 0xafbd2350644eeacf,0xe5d1929ef90898fa, - 0xdbac6c247d62a583,0xdf45f746b74abf39, - 0x894bc396ce5da772,0x6b8bba8c328eb783, - 0xab9eb47c81f5114f,0x66ea92f3f326564, - 0xd686619ba27255a2,0xc80a537b0efefebd, - 0x8613fd0145877585,0xbd06742ce95f5f36, - 0xa798fc4196e952e7,0x2c48113823b73704, - 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, - 0x82ef85133de648c4,0x9a984d73dbe722fb, - 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, - 0xcc963fee10b7d1b3,0x318df905079926a8, - 0xffbbcfe994e5c61f,0xfdf17746497f7052, - 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, - 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, - 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, - 0x9c1661a651213e2d,0x6bea10ca65c084e, - 0xc31bfa0fe5698db8,0x486e494fcff30a62, - 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, - 0x986ddb5c6b3a76b7,0xf89629465a75e01c, - 0xbe89523386091465,0xf6bbb397f1135823, - 0xee2ba6c0678b597f,0x746aa07ded582e2c, - 0x94db483840b717ef,0xa8c2a44eb4571cdc, - 0xba121a4650e4ddeb,0x92f34d62616ce413, - 0xe896a0d7e51e1566,0x77b020baf9c81d17, - 0x915e2486ef32cd60,0xace1474dc1d122e, - 0xb5b5ada8aaff80b8,0xd819992132456ba, - 0xe3231912d5bf60e6,0x10e1fff697ed6c69, - 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, - 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, - 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, - 
0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, - 0xad4ab7112eb3929d,0x86c16c98d2c953c6, - 0xd89d64d57a607744,0xe871c7bf077ba8b7, - 0x87625f056c7c4a8b,0x11471cd764ad4972, - 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, - 0xd389b47879823479,0x4aff1d108d4ec2c3, - 0x843610cb4bf160cb,0xcedf722a585139ba, - 0xa54394fe1eedb8fe,0xc2974eb4ee658828, - 0xce947a3da6a9273e,0x733d226229feea32, - 0x811ccc668829b887,0x806357d5a3f525f, - 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, - 0xc9bcff6034c13052,0xfc89b393dd02f0b5, - 0xfc2c3f3841f17c67,0xbbac2078d443ace2, - 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, - 0xc5029163f384a931,0xa9e795e65d4df11, - 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, - 0x99ea0196163fa42e,0x504bced1bf8e4e45, - 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, - 0xf07da27a82c37088,0x5d767327bb4e5a4c, - 0x964e858c91ba2655,0x3a6a07f8d510f86f, - 0xbbe226efb628afea,0x890489f70a55368b, - 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, - 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, - 0xb77ada0617e3bbcb,0x9ce6ebb40173744, - 0xe55990879ddcaabd,0xcc420a6a101d0515, - 0x8f57fa54c2a9eab6,0x9fa946824a12232d, - 0xb32df8e9f3546564,0x47939822dc96abf9, - 0xdff9772470297ebd,0x59787e2b93bc56f7, - 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, - 0xaefae51477a06b03,0xede622920b6b23f1, - 0xdab99e59958885c4,0xe95fab368e45eced, - 0x88b402f7fd75539b,0x11dbcb0218ebb414, - 0xaae103b5fcd2a881,0xd652bdc29f26a119, - 0xd59944a37c0752a2,0x4be76d3346f0495f, - 0x857fcae62d8493a5,0x6f70a4400c562ddb, - 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, - 0xd097ad07a71f26b2,0x7e2000a41346a7a7, - 0x825ecc24c873782f,0x8ed400668c0c28c8, - 0xa2f67f2dfa90563b,0x728900802f0f32fa, - 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, - 0xfea126b7d78186bc,0xe2f610c84987bfa8, - 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, - 0xc6ede63fa05d3143,0x91503d1c79720dbb, - 0xf8a95fcf88747d94,0x75a44c6397ce912a, - 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, - 0xc24452da229b021b,0xfbe85badce996168, - 0xf2d56790ab41c2a2,0xfae27299423fb9c3, - 0x97c560ba6b0919a5,0xdccd879fc967d41a, - 0xbdb6b8e905cb600f,0x5400e987bbc1c920, - 0xed246723473e3813,0x290123e9aab23b68, - 0x9436c0760c86e30b,0xf9a0b6720aaf6521, - 0xb94470938fa89bce,0xf808e40e8d5b3e69, - 0xe7958cb87392c2c2,0xb60b1d1230b20e04, - 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, - 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, - 0xe2280b6c20dd5232,0x25c6da63c38de1b0, - 0x8d590723948a535f,0x579c487e5a38ad0e, - 0xb0af48ec79ace837,0x2d835a9df0c6d851, - 0xdcdb1b2798182244,0xf8e431456cf88e65, - 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, - 0xac8b2d36eed2dac5,0xe272467e3d222f3f, - 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, - 0x86ccbb52ea94baea,0x98e947129fc2b4e9, - 0xa87fea27a539e9a5,0x3f2398d747b36224, - 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, - 0x83a3eeeef9153e89,0x1953cf68300424ac, - 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, - 0xcdb02555653131b6,0x3792f412cb06794d, - 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, - 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, - 0xc8de047564d20a8b,0xf245825a5a445275, - 0xfb158592be068d2e,0xeed6e2f0f0d56712, - 0x9ced737bb6c4183d,0x55464dd69685606b, - 0xc428d05aa4751e4c,0xaa97e14c3c26b886, - 0xf53304714d9265df,0xd53dd99f4b3066a8, - 0x993fe2c6d07b7fab,0xe546a8038efe4029, - 0xbf8fdb78849a5f96,0xde98520472bdd033, - 0xef73d256a5c0f77c,0x963e66858f6d4440, - 0x95a8637627989aad,0xdde7001379a44aa8, - 0xbb127c53b17ec159,0x5560c018580d5d52, - 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, - 0x9226712162ab070d,0xcab3961304ca70e8, - 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, - 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, - 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, - 0xb267ed1940f1c61c,0x55f038b237591ed3, - 
0xdf01e85f912e37a3,0x6b6c46dec52f6688, - 0x8b61313bbabce2c6,0x2323ac4b3b3da015, - 0xae397d8aa96c1b77,0xabec975e0a0d081a, - 0xd9c7dced53c72255,0x96e7bd358c904a21, - 0x881cea14545c7575,0x7e50d64177da2e54, - 0xaa242499697392d2,0xdde50bd1d5d0b9e9, - 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, - 0x84ec3c97da624ab4,0xbd5af13bef0b113e, - 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, - 0xcfb11ead453994ba,0x67de18eda5814af2, - 0x81ceb32c4b43fcf4,0x80eacf948770ced7, - 0xa2425ff75e14fc31,0xa1258379a94d028d, - 0xcad2f7f5359a3b3e,0x96ee45813a04330, - 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, - 0x9e74d1b791e07e48,0x775ea264cf55347e, - 0xc612062576589dda,0x95364afe032a819e, - 0xf79687aed3eec551,0x3a83ddbd83f52205, - 0x9abe14cd44753b52,0xc4926a9672793543, - 0xc16d9a0095928a27,0x75b7053c0f178294, - 0xf1c90080baf72cb1,0x5324c68b12dd6339, - 0x971da05074da7bee,0xd3f6fc16ebca5e04, - 0xbce5086492111aea,0x88f4bb1ca6bcf585, - 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, - 0x9392ee8e921d5d07,0x3aff322e62439fd0, - 0xb877aa3236a4b449,0x9befeb9fad487c3, - 0xe69594bec44de15b,0x4c2ebe687989a9b4, - 0x901d7cf73ab0acd9,0xf9d37014bf60a11, - 0xb424dc35095cd80f,0x538484c19ef38c95, - 0xe12e13424bb40e13,0x2865a5f206b06fba, - 0x8cbccc096f5088cb,0xf93f87b7442e45d4, - 0xafebff0bcb24aafe,0xf78f69a51539d749, - 0xdbe6fecebdedd5be,0xb573440e5a884d1c, - 0x89705f4136b4a597,0x31680a88f8953031, - 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, - 0xd6bf94d5e57a42bc,0x3d32907604691b4d, - 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, - 0xa7c5ac471b478423,0xfcf80dc33721d54, - 0xd1b71758e219652b,0xd3c36113404ea4a9, - 0x83126e978d4fdf3b,0x645a1cac083126ea, - 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, - 0xcccccccccccccccc,0xcccccccccccccccd, - 0x8000000000000000,0x0, - 0xa000000000000000,0x0, - 0xc800000000000000,0x0, - 0xfa00000000000000,0x0, - 0x9c40000000000000,0x0, - 0xc350000000000000,0x0, - 0xf424000000000000,0x0, - 0x9896800000000000,0x0, - 0xbebc200000000000,0x0, - 0xee6b280000000000,0x0, - 0x9502f90000000000,0x0, - 0xba43b74000000000,0x0, - 0xe8d4a51000000000,0x0, - 0x9184e72a00000000,0x0, - 0xb5e620f480000000,0x0, - 0xe35fa931a0000000,0x0, - 0x8e1bc9bf04000000,0x0, - 0xb1a2bc2ec5000000,0x0, - 0xde0b6b3a76400000,0x0, - 0x8ac7230489e80000,0x0, - 0xad78ebc5ac620000,0x0, - 0xd8d726b7177a8000,0x0, - 0x878678326eac9000,0x0, - 0xa968163f0a57b400,0x0, - 0xd3c21bcecceda100,0x0, - 0x84595161401484a0,0x0, - 0xa56fa5b99019a5c8,0x0, - 0xcecb8f27f4200f3a,0x0, - 0x813f3978f8940984,0x4000000000000000, - 0xa18f07d736b90be5,0x5000000000000000, - 0xc9f2c9cd04674ede,0xa400000000000000, - 0xfc6f7c4045812296,0x4d00000000000000, - 0x9dc5ada82b70b59d,0xf020000000000000, - 0xc5371912364ce305,0x6c28000000000000, - 0xf684df56c3e01bc6,0xc732000000000000, - 0x9a130b963a6c115c,0x3c7f400000000000, - 0xc097ce7bc90715b3,0x4b9f100000000000, - 0xf0bdc21abb48db20,0x1e86d40000000000, - 0x96769950b50d88f4,0x1314448000000000, - 0xbc143fa4e250eb31,0x17d955a000000000, - 0xeb194f8e1ae525fd,0x5dcfab0800000000, - 0x92efd1b8d0cf37be,0x5aa1cae500000000, - 0xb7abc627050305ad,0xf14a3d9e40000000, - 0xe596b7b0c643c719,0x6d9ccd05d0000000, - 0x8f7e32ce7bea5c6f,0xe4820023a2000000, - 0xb35dbf821ae4f38b,0xdda2802c8a800000, - 0xe0352f62a19e306e,0xd50b2037ad200000, - 0x8c213d9da502de45,0x4526f422cc340000, - 0xaf298d050e4395d6,0x9670b12b7f410000, - 0xdaf3f04651d47b4c,0x3c0cdd765f114000, - 0x88d8762bf324cd0f,0xa5880a69fb6ac800, - 0xab0e93b6efee0053,0x8eea0d047a457a00, - 0xd5d238a4abe98068,0x72a4904598d6d880, - 0x85a36366eb71f041,0x47a6da2b7f864750, - 0xa70c3c40a64e6c51,0x999090b65f67d924, - 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, - 
0x82818f1281ed449f,0xbff8f10e7a8921a4, - 0xa321f2d7226895c7,0xaff72d52192b6a0d, - 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, - 0xfee50b7025c36a08,0x2f236d04753d5b4, - 0x9f4f2726179a2245,0x1d762422c946590, - 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, - 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, - 0x9b934c3b330c8577,0x63cc55f49f88eb2f, - 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, - 0xf316271c7fc3908a,0x8bef464e3945ef7a, - 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, - 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, - 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, - 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, - 0xb975d6b6ee39e436,0xb3e2fd538e122b44, - 0xe7d34c64a9c85d44,0x60dbbca87196b616, - 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, - 0xb51d13aea4a488dd,0x6babab6398bdbe41, - 0xe264589a4dcdab14,0xc696963c7eed2dd1, - 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, - 0xb0de65388cc8ada8,0x3b25a55f43294bcb, - 0xdd15fe86affad912,0x49ef0eb713f39ebe, - 0x8a2dbf142dfcc7ab,0x6e3569326c784337, - 0xacb92ed9397bf996,0x49c2c37f07965404, - 0xd7e77a8f87daf7fb,0xdc33745ec97be906, - 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, - 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, - 0xd2d80db02aabd62b,0xf50a3fa490c30190, - 0x83c7088e1aab65db,0x792667c6da79e0fa, - 0xa4b8cab1a1563f52,0x577001b891185938, - 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, - 0x80b05e5ac60b6178,0x544f8158315b05b4, - 0xa0dc75f1778e39d6,0x696361ae3db1c721, - 0xc913936dd571c84c,0x3bc3a19cd1e38e9, - 0xfb5878494ace3a5f,0x4ab48a04065c723, - 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, - 0xc45d1df942711d9a,0x3ba5d0bd324f8394, - 0xf5746577930d6500,0xca8f44ec7ee36479, - 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, - 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, - 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, - 0x95d04aee3b80ece5,0xbba1f1d158724a12, - 0xbb445da9ca61281f,0x2a8a6e45ae8edc97, - 0xea1575143cf97226,0xf52d09d71a3293bd, - 0x924d692ca61be758,0x593c2626705f9c56, - 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, - 0xe498f455c38b997a,0xb6dfb9c0f956447, - 0x8edf98b59a373fec,0x4724bd4189bd5eac, - 0xb2977ee300c50fe7,0x58edec91ec2cb657, - 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, - 0x8b865b215899f46c,0xbd79e0d20082ee74, - 0xae67f1e9aec07187,0xecd8590680a3aa11, - 0xda01ee641a708de9,0xe80e6f4820cc9495, - 0x884134fe908658b2,0x3109058d147fdcdd, - 0xaa51823e34a7eede,0xbd4b46f0599fd415, - 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, - 0x850fadc09923329e,0x3e2cf6bc604ddb0, - 0xa6539930bf6bff45,0x84db8346b786151c, - 0xcfe87f7cef46ff16,0xe612641865679a63, - 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, - 0xa26da3999aef7749,0xe3be5e330f38f09d, - 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, - 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, - 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, - 0xc646d63501a1511d,0xb281e1fd541501b8, - 0xf7d88bc24209a565,0x1f225a7ca91a4226, - 0x9ae757596946075f,0x3375788de9b06958, - 0xc1a12d2fc3978937,0x52d6b1641c83ae, - 0xf209787bb47d6b84,0xc0678c5dbd23a49a, - 0x9745eb4d50ce6332,0xf840b7ba963646e0, - 0xbd176620a501fbff,0xb650e5a93bc3d898, - 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, - 0x93ba47c980e98cdf,0xc66f336c36b10137, - 0xb8a8d9bbe123f017,0xb80b0047445d4184, - 0xe6d3102ad96cec1d,0xa60dc059157491e5, - 0x9043ea1ac7e41392,0x87c89837ad68db2f, - 0xb454e4a179dd1877,0x29babe4598c311fb, - 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, - 0x8ce2529e2734bb1d,0x1899e4a65f58660c, - 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, - 0xdc21a1171d42645d,0x76707543f4fa1f73, - 0x899504ae72497eba,0x6a06494a791c53a8, - 0xabfa45da0edbde69,0x487db9d17636892, - 0xd6f8d7509292d603,0x45a9d2845d3c42b6, - 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, - 0xa7f26836f282b732,0x8e6cac7768d7141e, - 0xd1ef0244af2364ff,0x3207d795430cd926, 
- 0x8335616aed761f1f,0x7f44e6bd49e807b8, - 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, - 0xcd036837130890a1,0x36dba887c37a8c0f, - 0x802221226be55a64,0xc2494954da2c9789, - 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, - 0xc83553c5c8965d3d,0x6f92829494e5acc7, - 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, - 0x9c69a97284b578d7,0xff2a760414536efb, - 0xc38413cf25e2d70d,0xfef5138519684aba, - 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, - 0x98bf2f79d5993802,0xef2f773ffbd97a61, - 0xbeeefb584aff8603,0xaafb550ffacfd8fa, - 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, - 0x952ab45cfa97a0b2,0xdd945a747bf26183, - 0xba756174393d88df,0x94f971119aeef9e4, - 0xe912b9d1478ceb17,0x7a37cd5601aab85d, - 0x91abb422ccb812ee,0xac62e055c10ab33a, - 0xb616a12b7fe617aa,0x577b986b314d6009, - 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, - 0x8e41ade9fbebc27d,0x14588f13be847307, - 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, - 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, - 0x8aec23d680043bee,0x25de7bb9480d5854, - 0xada72ccc20054ae9,0xaf561aa79a10ae6a, - 0xd910f7ff28069da4,0x1b2ba1518094da04, - 0x87aa9aff79042286,0x90fb44d2f05d0842, - 0xa99541bf57452b28,0x353a1607ac744a53, - 0xd3fa922f2d1675f2,0x42889b8997915ce8, - 0x847c9b5d7c2e09b7,0x69956135febada11, - 0xa59bc234db398c25,0x43fab9837e699095, - 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, - 0x8161afb94b44f57d,0x1d1be0eebac278f5, - 0xa1ba1ba79e1632dc,0x6462d92a69731732, - 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, - 0xfcb2cb35e702af78,0x5cda735244c3d43e, - 0x9defbf01b061adab,0x3a0888136afa64a7, - 0xc56baec21c7a1916,0x88aaa1845b8fdd0, - 0xf6c69a72a3989f5b,0x8aad549e57273d45, - 0x9a3c2087a63f6399,0x36ac54e2f678864b, - 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, - 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, - 0x969eb7c47859e743,0x9f644ae5a4b1b325, - 0xbc4665b596706114,0x873d5d9f0dde1fee, - 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, - 0x9316ff75dd87cbd8,0x9a7f12442d588f2, - 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, - 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, - 0x8fa475791a569d10,0xf96e017d694487bc, - 0xb38d92d760ec4455,0x37c981dcc395a9ac, - 0xe070f78d3927556a,0x85bbe253f47b1417, - 0x8c469ab843b89562,0x93956d7478ccec8e, - 0xaf58416654a6babb,0x387ac8d1970027b2, - 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, - 0x88fcf317f22241e2,0x441fece3bdf81f03, - 0xab3c2fddeeaad25a,0xd527e81cad7626c3, - 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, - 0x85c7056562757456,0xf6872d5667844e49, - 0xa738c6bebb12d16c,0xb428f8ac016561db, - 0xd106f86e69d785c7,0xe13336d701beba52, - 0x82a45b450226b39c,0xecc0024661173473, - 0xa34d721642b06084,0x27f002d7f95d0190, - 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, - 0xff290242c83396ce,0x7e67047175a15271, - 0x9f79a169bd203e41,0xf0062c6e984d386, - 0xc75809c42c684dd1,0x52c07b78a3e60868, - 0xf92e0c3537826145,0xa7709a56ccdf8a82, - 0x9bbcc7a142b17ccb,0x88a66076400bb691, - 0xc2abf989935ddbfe,0x6acff893d00ea435, - 0xf356f7ebf83552fe,0x583f6b8c4124d43, - 0x98165af37b2153de,0xc3727a337a8b704a, - 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, - 0xeda2ee1c7064130c,0x1162def06f79df73, - 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, - 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, - 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, - 0x910ab1d4db9914a0,0x1d9c9892400a22a2, - 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, - 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, - 0x8da471a9de737e24,0x5ceaecfed289e5d2, - 0xb10d8e1456105dad,0x7425a83e872c5f47, - 0xdd50f1996b947518,0xd12f124e28f77719, - 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, - 0xace73cbfdc0bfb7b,0x636cc64d1001550b, - 0xd8210befd30efa5a,0x3c47f7e05401aa4e, - 0x8714a775e3e95c78,0x65acfaec34810a71, - 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, - 
0xd31045a8341ca07c,0x1ede48111209a050, - 0x83ea2b892091e44d,0x934aed0aab460432, - 0xa4e4b66b68b65d60,0xf81da84d5617853f, - 0xce1de40642e3f4b9,0x36251260ab9d668e, - 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, - 0xa1075a24e4421730,0xb24cf65b8612f81f, - 0xc94930ae1d529cfc,0xdee033f26797b627, - 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, - 0x9d412e0806e88aa5,0x8e1f289560ee864e, - 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, - 0xf5b5d7ec8acb58a2,0xae10af696774b1db, - 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, - 0xbff610b0cc6edd3f,0x17fd090a58d32af3, - 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, - 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, - 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, - 0xea53df5fd18d5513,0x84c86189216dc5ed, - 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, - 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, - 0xe4d5e82392a40515,0xfabaf3feaa5334a, - 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, - 0xb2c71d5bca9023f8,0x743e20e9ef511012, - 0xdf78e4b2bd342cf6,0x914da9246b255416, - 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, - 0xae9672aba3d0c320,0xa184ac2473b529b1, - 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, - 0x8865899617fb1871,0x7e2fa67c7a658892, - 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, - 0xd51ea6fa85785631,0x552a74227f3ea565, - 0x8533285c936b35de,0xd53a88958f87275f, - 0xa67ff273b8460356,0x8a892abaf368f137, - 0xd01fef10a657842c,0x2d2b7569b0432d85, - 0x8213f56a67f6b29b,0x9c3b29620e29fc73, - 0xa298f2c501f45f42,0x8349f3ba91b47b8f, - 0xcb3f2f7642717713,0x241c70a936219a73, - 0xfe0efb53d30dd4d7,0xed238cd383aa0110, - 0x9ec95d1463e8a506,0xf4363804324a40aa, - 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, - 0xf81aa16fdc1b81da,0xdd94b7868e94050a, - 0x9b10a4e5e9913128,0xca7cf2b4191c8326, - 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, - 0xf24a01a73cf2dccf,0xbc633b39673c8cec, - 0x976e41088617ca01,0xd5be0503e085d813, - 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, - 0xec9c459d51852ba2,0xddf8e7d60ed1219e, - 0x93e1ab8252f33b45,0xcabb90e5c942b503, - 0xb8da1662e7b00a17,0x3d6a751f3b936243, - 0xe7109bfba19c0c9d,0xcc512670a783ad4, - 0x906a617d450187e2,0x27fb2b80668b24c5, - 0xb484f9dc9641e9da,0xb1f9f660802dedf6, - 0xe1a63853bbd26451,0x5e7873f8a0396973, - 0x8d07e33455637eb2,0xdb0b487b6423e1e8, - 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, - 0xdc5c5301c56b75f7,0x7641a140cc7810fb, - 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, - 0xac2820d9623bf429,0x546345fa9fbdcd44, - 0xd732290fbacaf133,0xa97c177947ad4095, - 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, - 0xa81f301449ee8c70,0x5c68f256bfff5a74, - 0xd226fc195c6a2f8c,0x73832eec6fff3111, - 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, - 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, - 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, - 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, - 0xa0555e361951c366,0xd7e105bcc332621f, - 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, - 0xfa856334878fc150,0xb14f98f6f0feb951, - 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, - 0xc3b8358109e84f07,0xa862f80ec4700c8, - 0xf4a642e14c6262c8,0xcd27bb612758c0fa, - 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, - 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, - 0xeeea5d5004981478,0x1858ccfce06cac74, - 0x95527a5202df0ccb,0xf37801e0c43ebc8, - 0xbaa718e68396cffd,0xd30560258f54e6ba, - 0xe950df20247c83fd,0x47c6b82ef32a2069, - 0x91d28b7416cdd27e,0x4cdc331d57fa5441, - 0xb6472e511c81471d,0xe0133fe4adf8e952, - 0xe3d8f9e563a198e5,0x58180fddd97723a6, - 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; +constexpr uint64_t powers_template::power_of_five_128[number_of_entries]; + using powers = powers_template<>; -} +} // namespace fast_float } // namespace arrow_vendored #endif diff --git a/cpp/src/arrow/vendored/fast_float/float_common.h 
b/cpp/src/arrow/vendored/fast_float/float_common.h index 0d6bfe7efb88b..717320126750c 100644 --- a/cpp/src/arrow/vendored/fast_float/float_common.h +++ b/cpp/src/arrow/vendored/fast_float/float_common.h @@ -7,6 +7,25 @@ #include #include +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + +#if __cpp_lib_bit_cast >= 201806L +#include +#define FASTFLOAT_HAS_BIT_CAST 1 +#else +#define FASTFLOAT_HAS_BIT_CAST 0 +#endif + +#if __cpp_lib_is_constant_evaluated >= 201811L +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 +#else +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 +#endif + #if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) \ || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \ || defined(__MINGW64__) \ @@ -14,7 +33,7 @@ || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) ) #define FASTFLOAT_64BIT 1 #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ - || defined(__arm__) || defined(_M_ARM) \ + || defined(__arm__) || defined(_M_ARM) || defined(__ppc__) \ || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) #define FASTFLOAT_32BIT 1 #else @@ -50,7 +69,11 @@ #elif defined(sun) || defined(__sun) #include #else +#ifdef __has_include +#if __has_include() #include +#endif //__has_include() +#endif //__has_include #endif # #ifndef __BYTE_ORDER__ @@ -77,23 +100,46 @@ #endif #ifndef FASTFLOAT_ASSERT -#define FASTFLOAT_ASSERT(x) { if (!(x)) abort(); } +#define FASTFLOAT_ASSERT(x) { ((void)(x)); } #endif #ifndef FASTFLOAT_DEBUG_ASSERT -#include -#define FASTFLOAT_DEBUG_ASSERT(x) assert(x) +#define FASTFLOAT_DEBUG_ASSERT(x) { ((void)(x)); } #endif // rust style `try!()` macro, or `?` operator #define FASTFLOAT_TRY(x) { if (!(x)) return false; } +// Testing for https://wg21.link/N3652, adopted in C++14 +#if __cpp_constexpr >= 201304 +#define FASTFLOAT_CONSTEXPR14 constexpr +#else +#define FASTFLOAT_CONSTEXPR14 +#endif + +// Testing for relevant C++20 constexpr library features +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \ + && FASTFLOAT_HAS_BIT_CAST \ + && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#define FASTFLOAT_CONSTEXPR20 constexpr +#else +#define FASTFLOAT_CONSTEXPR20 +#endif + namespace arrow_vendored { namespace fast_float { +fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED + return std::is_constant_evaluated(); +#else + return false; +#endif +} + // Compares two ASCII strings in a case insensitive manner. 
-inline bool fastfloat_strncasecmp(const char *input1, const char *input2, - size_t length) { +inline FASTFLOAT_CONSTEXPR14 bool +fastfloat_strncasecmp(const char *input1, const char *input2, size_t length) { char running_diff{0}; for (size_t i = 0; i < length; i++) { running_diff |= (input1[i] ^ input2[i]); @@ -110,14 +156,14 @@ template struct span { const T* ptr; size_t length; - span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {} - span() : ptr(nullptr), length(0) {} + constexpr span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span() : ptr(nullptr), length(0) {} constexpr size_t len() const noexcept { return length; } - const T& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const T& operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return ptr[index]; } @@ -126,13 +172,31 @@ struct span { struct value128 { uint64_t low; uint64_t high; - value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} - value128() : low(0), high(0) {} + constexpr value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + constexpr value128() : low(0), high(0) {} }; +/* Helper C++11 constexpr generic implementation of leading_zeroes */ +fastfloat_really_inline constexpr +int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + return ( + ((input_num & uint64_t(0xffffffff00000000)) && (input_num >>= 32, last_bit |= 32)), + ((input_num & uint64_t( 0xffff0000)) && (input_num >>= 16, last_bit |= 16)), + ((input_num & uint64_t( 0xff00)) && (input_num >>= 8, last_bit |= 8)), + ((input_num & uint64_t( 0xf0)) && (input_num >>= 4, last_bit |= 4)), + ((input_num & uint64_t( 0xc)) && (input_num >>= 2, last_bit |= 2)), + ((input_num & uint64_t( 0x2)) && (input_num >>= 1, last_bit |= 1)), + 63 - last_bit + ); +} + /* result might be undefined when input_num is zero */ -fastfloat_really_inline int leading_zeroes(uint64_t input_num) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +int leading_zeroes(uint64_t input_num) { assert(input_num > 0); + if (cpp20_and_in_constexpr()) { + return leading_zeroes_generic(input_num); + } #ifdef FASTFLOAT_VISUAL_STUDIO #if defined(_M_X64) || defined(_M_ARM64) unsigned long leading_zero = 0; @@ -141,31 +205,20 @@ fastfloat_really_inline int leading_zeroes(uint64_t input_num) { _BitScanReverse64(&leading_zero, input_num); return (int)(63 - leading_zero); #else - int last_bit = 0; - if(input_num & uint64_t(0xffffffff00000000)) input_num >>= 32, last_bit |= 32; - if(input_num & uint64_t( 0xffff0000)) input_num >>= 16, last_bit |= 16; - if(input_num & uint64_t( 0xff00)) input_num >>= 8, last_bit |= 8; - if(input_num & uint64_t( 0xf0)) input_num >>= 4, last_bit |= 4; - if(input_num & uint64_t( 0xc)) input_num >>= 2, last_bit |= 2; - if(input_num & uint64_t( 0x2)) input_num >>= 1, last_bit |= 1; - return 63 - last_bit; + return leading_zeroes_generic(input_num); #endif #else return __builtin_clzll(input_num); #endif } -#ifdef FASTFLOAT_32BIT - // slow emulation routine for 32-bit -fastfloat_really_inline uint64_t emulu(uint32_t x, uint32_t y) { +fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { return x * (uint64_t)y; } -// slow emulation routine for 32-bit -#if !defined(__MINGW64__) -fastfloat_really_inline uint64_t _umul128(uint64_t ab, uint64_t cd, - uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); 
uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); @@ -175,14 +228,28 @@ fastfloat_really_inline uint64_t _umul128(uint64_t ab, uint64_t cd, (adbc_carry << 32) + !!(lo < bd); return lo; } + +#ifdef FASTFLOAT_32BIT + +// slow emulation routine for 32-bit +#if !defined(__MINGW64__) +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 +uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + return umul128_generic(ab, cd, hi); +} #endif // !__MINGW64__ #endif // FASTFLOAT_32BIT // compute 64-bit a*b -fastfloat_really_inline value128 full_multiplication(uint64_t a, - uint64_t b) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +value128 full_multiplication(uint64_t a, uint64_t b) { + if (cpp20_and_in_constexpr()) { + value128 answer; + answer.low = umul128_generic(a, b, &answer.high); + return answer; + } value128 answer; #if defined(_M_ARM64) && !defined(__MINGW32__) // ARM64 has native support for 64-bit multiplications, no need to emulate @@ -196,7 +263,7 @@ fastfloat_really_inline value128 full_multiplication(uint64_t a, answer.low = uint64_t(r); answer.high = uint64_t(r >> 64); #else - #error Not implemented + answer.low = umul128_generic(a, b, &answer.high); #endif return answer; } @@ -205,10 +272,10 @@ struct adjusted_mantissa { uint64_t mantissa{0}; int32_t power2{0}; // a negative value indicates an invalid result adjusted_mantissa() = default; - bool operator==(const adjusted_mantissa &o) const { + constexpr bool operator==(const adjusted_mantissa &o) const { return mantissa == o.mantissa && power2 == o.power2; } - bool operator!=(const adjusted_mantissa &o) const { + constexpr bool operator!=(const adjusted_mantissa &o) const { return mantissa != o.mantissa || power2 != o.power2; } }; @@ -219,8 +286,8 @@ constexpr static int32_t invalid_am_bias = -0x8000; constexpr static double powers_of_ten_double[] = { 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; -constexpr static float powers_of_ten_float[] = {1e0, 1e1, 1e2, 1e3, 1e4, 1e5, - 1e6, 1e7, 1e8, 1e9, 1e10}; +constexpr static float powers_of_ten_float[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; // used for max_mantissa_double and max_mantissa_float constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; // Largest integer value v so that (5**index * v) <= 1<<53. @@ -433,23 +500,41 @@ template <> inline constexpr binary_format::equiv_uint } template -fastfloat_really_inline void to_float(bool negative, adjusted_mantissa am, T &value) { - uint64_t word = am.mantissa; - word |= uint64_t(am.power2) << binary_format::mantissa_explicit_bits(); - word = negative - ? 
word | (uint64_t(1) << binary_format::sign_index()) : word; -#if FASTFLOAT_IS_BIG_ENDIAN == 1 - if (std::is_same::value) { - ::memcpy(&value, (char *)&word + 4, sizeof(T)); // extract value at offset 4-7 if float on big-endian - } else { - ::memcpy(&value, &word, sizeof(T)); - } +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +void to_float(bool negative, adjusted_mantissa am, T &value) { + using uint = typename binary_format::equiv_uint; + uint word = (uint)am.mantissa; + word |= uint(am.power2) << binary_format::mantissa_explicit_bits(); + word |= uint(negative) << binary_format::sign_index(); +#if FASTFLOAT_HAS_BIT_CAST + value = std::bit_cast(word); #else - // For little-endian systems: - ::memcpy(&value, &word, sizeof(T)); + ::memcpy(&value, &word, sizeof(T)); #endif } +#if FASTFLOAT_SKIP_WHITE_SPACE // disabled by default +template +struct space_lut { + static constexpr bool value[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +}; + +template +constexpr bool space_lut::value[]; + +inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } +#endif } // namespace fast_float } // namespace arrow_vendored diff --git a/cpp/src/arrow/vendored/fast_float/parse_number.h b/cpp/src/arrow/vendored/fast_float/parse_number.h index e1c9603aeaa94..905d614c9db29 100644 --- a/cpp/src/arrow/vendored/fast_float/parse_number.h +++ b/cpp/src/arrow/vendored/fast_float/parse_number.h @@ -30,6 +30,11 @@ from_chars_result parse_infnan(const char *first, const char *last, T &value) n minusSign = true; ++first; } +#if FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if (*first == '+') { + ++first; + } +#endif if (last - first >= 3) { if (fastfloat_strncasecmp(first, "nan", 3)) { answer.ptr = (first += 3); @@ -67,6 +72,10 @@ from_chars_result parse_infnan(const char *first, const char *last, T &value) n * Credit : @mwalcott3 */ fastfloat_really_inline bool rounds_to_nearest() noexcept { + // https://lemire.me/blog/2020/06/26/gcc-not-nearest/ +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return false; +#endif // See // A fast function to check your floating-point rounding mode // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ @@ -100,7 +109,25 @@ fastfloat_really_inline bool rounds_to_nearest() noexcept { // // Note: This may fail to be accurate if fast-math has been // enabled, as rounding conventions may not apply. + #if FASTFLOAT_VISUAL_STUDIO + # pragma warning(push) + // todo: is there a VS warning? 
+ // see https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 + #elif defined(__clang__) + # pragma clang diagnostic push + # pragma clang diagnostic ignored "-Wfloat-equal" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # pragma GCC diagnostic ignored "-Wfloat-equal" + #endif return (fmini + 1.0f == 1.0f - fmini); + #if FASTFLOAT_VISUAL_STUDIO + # pragma warning(pop) + #elif defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif } } // namespace detail @@ -119,6 +146,11 @@ from_chars_result from_chars_advanced(const char *first, const char *last, from_chars_result answer; +#if FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif if (first == last) { answer.ec = std::errc::invalid_argument; answer.ptr = first; diff --git a/cpp/src/arrow/vendored/fast_float/update.sh b/cpp/src/arrow/vendored/fast_float/update.sh index ab6e9515da5d8..f0e2d3dc508c5 100755 --- a/cpp/src/arrow/vendored/fast_float/update.sh +++ b/cpp/src/arrow/vendored/fast_float/update.sh @@ -23,7 +23,7 @@ source_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" if [ "$#" -ne 1 ]; then echo "Usage: $0 VERSION" - echo " e.g.: $0 3.8.1" + echo " e.g.: $0 3.10.1" exit 1 fi From b9a647023de40a670ec8d9c32ff6e205debfaa8d Mon Sep 17 00:00:00 2001 From: abandy Date: Wed, 19 Jul 2023 19:55:47 -0400 Subject: [PATCH 016/749] GH-36546: [Swift] The initial implementation for swift arrow flight (#36547) The initial implementation for swift arrow flight. This change depends on PR GH-36544. * Closes: #36546 Authored-by: Alva Bandy Signed-off-by: Sutou Kouhei --- ci/scripts/swift_test.sh | 5 + swift/ArrowFlight/.gitignore | 9 + swift/ArrowFlight/Package.swift | 53 + .../Sources/ArrowFlight/Flight.grpc.swift | 1343 ++++++++++++++++ .../Sources/ArrowFlight/Flight.pb.swift | 1366 +++++++++++++++++ .../Sources/ArrowFlight/FlightAction.swift | 39 + .../ArrowFlight/FlightActionType.swift | 38 + .../Sources/ArrowFlight/FlightClient.swift | 144 ++ .../Sources/ArrowFlight/FlightCriteria.swift | 37 + .../Sources/ArrowFlight/FlightData.swift | 46 + .../ArrowFlight/FlightDescriptor.swift | 56 + .../Sources/ArrowFlight/FlightEndpoint.swift | 38 + .../Sources/ArrowFlight/FlightInfo.swift | 55 + .../Sources/ArrowFlight/FlightLocation.swift | 36 + .../Sources/ArrowFlight/FlightPutResult.swift | 35 + .../Sources/ArrowFlight/FlightResult.swift | 35 + .../ArrowFlight/FlightSchemaResult.swift | 37 + .../Sources/ArrowFlight/FlightServer.swift | 162 ++ .../Sources/ArrowFlight/FlightTicket.swift | 35 + .../ArrowFlight/RecordBatchStreamReader.swift | 67 + .../ArrowFlight/RecordBatchStreamWriter.swift | 91 ++ .../Tests/ArrowFlightTests/FlightTest.swift | 302 ++++ swift/gen-protobuffers.sh | 44 + 23 files changed, 4073 insertions(+) create mode 100644 swift/ArrowFlight/.gitignore create mode 100644 swift/ArrowFlight/Package.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift create mode 100644 
swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift create mode 100644 swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift create mode 100644 swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift create mode 100755 swift/gen-protobuffers.sh diff --git a/ci/scripts/swift_test.sh b/ci/scripts/swift_test.sh index eac13c5d68ef4..b7ab37fd489c9 100755 --- a/ci/scripts/swift_test.sh +++ b/ci/scripts/swift_test.sh @@ -30,3 +30,8 @@ source_dir=${1}/swift/Arrow pushd ${source_dir} swift test popd + +source_dir=${1}/swift/ArrowFlight +pushd ${source_dir} +swift test +popd diff --git a/swift/ArrowFlight/.gitignore b/swift/ArrowFlight/.gitignore new file mode 100644 index 0000000000000..d561187385c2d --- /dev/null +++ b/swift/ArrowFlight/.gitignore @@ -0,0 +1,9 @@ +.DS_Store +/.build +/Packages +/*.xcodeproj +xcuserdata/ +DerivedData/ +.swiftpm/ +.netrc +Package.resolved \ No newline at end of file diff --git a/swift/ArrowFlight/Package.swift b/swift/ArrowFlight/Package.swift new file mode 100644 index 0000000000000..f3caa83486764 --- /dev/null +++ b/swift/ArrowFlight/Package.swift @@ -0,0 +1,53 @@ +// swift-tools-version:5.7 +// The swift-tools-version declares the minimum version of Swift required to build this package. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import PackageDescription + +let package = Package( + name: "ArrowFlight", + platforms: [ + .macOS(.v10_15) + ], + products: [ + // Products define the executables and libraries a package produces, making them visible to other packages. + .library( + name: "ArrowFlight", + targets: ["ArrowFlight"]), + ], + dependencies: [ + .package(url: "https://github.com/grpc/grpc-swift.git", from: "1.15.0"), + .package(url: "https://github.com/apple/swift-protobuf.git", from: "1.6.0"), + .package(path: "../Arrow") + ], + targets: [ + // Targets are the basic building blocks of a package, defining a module or a test suite. 
+ // Targets can depend on other targets in this package and products from dependencies. + .target( + name: "ArrowFlight", + dependencies: [ + .product(name: "Arrow", package: "Arrow"), + .product(name: "GRPC", package: "grpc-swift"), + .product(name: "SwiftProtobuf", package: "swift-protobuf") + ]), + .testTarget( + name: "ArrowFlightTests", + dependencies: ["ArrowFlight"]), + ] +) diff --git a/swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift b/swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift new file mode 100644 index 0000000000000..8daaa19f07b50 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift @@ -0,0 +1,1343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// DO NOT EDIT. +// swift-format-ignore-file +// +// Generated by the protocol buffer compiler. +// Source: Flight.proto +// +import GRPC +import NIO +import NIOConcurrencyHelpers +import SwiftProtobuf + + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. Additionally, a flight service +/// can expose a set of actions that are available. +/// +/// Usage: instantiate `Arrow_Flight_Protocol_FlightServiceClient`, then call methods of this protocol to make API calls. +internal protocol Arrow_Flight_Protocol_FlightServiceClientProtocol: GRPCClient { + var serviceName: String { get } + var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { get } + + func handshake( + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_HandshakeResponse) -> Void + ) -> BidirectionalStreamingCall + + func listFlights( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_FlightInfo) -> Void + ) -> ServerStreamingCall + + func getFlightInfo( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? + ) -> UnaryCall + + func getSchema( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? 
+ ) -> UnaryCall + + func doGet( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> ServerStreamingCall + + func doPut( + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_PutResult) -> Void + ) -> BidirectionalStreamingCall + + func doExchange( + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> BidirectionalStreamingCall + + func doAction( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_Result) -> Void + ) -> ServerStreamingCall + + func listActions( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions?, + handler: @escaping (Arrow_Flight_Protocol_ActionType) -> Void + ) -> ServerStreamingCall +} + +extension Arrow_Flight_Protocol_FlightServiceClientProtocol { + internal var serviceName: String { + return "arrow.flight.protocol.FlightService" + } + + /// + /// Handshake between client and server. Depending on the server, the + /// handshake may be required to determine the token that should be used for + /// future operations. Both request and response are streams to allow multiple + /// round-trips depending on auth mechanism. + /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// + /// - Parameters: + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. + internal func handshake( + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_HandshakeResponse) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Get a list of available streams given a particular criteria. Most flight + /// services will expose one or more streams that are readily available for + /// retrieval. This api allows listing the streams available for + /// consumption. A user can also provide a criteria. The criteria can limit + /// the subset of streams that can be listed via this interface. Each flight + /// service allows its own definition of how to consume criteria. + /// + /// - Parameters: + /// - request: Request to send to ListFlights. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func listFlights( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_FlightInfo) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [], + handler: handler + ) + } + + /// + /// For a given FlightDescriptor, get information about how the flight can be + /// consumed. 
This is a useful interface if the consumer of the interface + /// already can identify the specific flight to consume. This interface can + /// also allow a consumer to generate a flight stream through a specified + /// descriptor. For example, a flight descriptor might be something that + /// includes a SQL statement or a Pickled Python operation that will be + /// executed. In those cases, the descriptor will not be previously available + /// within the list of available streams provided by ListFlights but will be + /// available for consumption for the duration defined by the specific flight + /// service. + /// + /// - Parameters: + /// - request: Request to send to GetFlightInfo. + /// - callOptions: Call options. + /// - Returns: A `UnaryCall` with futures for the metadata, status and response. + internal func getFlightInfo( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> UnaryCall { + return self.makeUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [] + ) + } + + /// + /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + /// This is used when a consumer needs the Schema of flight stream. Similar to + /// GetFlightInfo this interface may generate a new flight that was not previously + /// available in ListFlights. + /// + /// - Parameters: + /// - request: Request to send to GetSchema. + /// - callOptions: Call options. + /// - Returns: A `UnaryCall` with futures for the metadata, status and response. + internal func getSchema( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> UnaryCall { + return self.makeUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [] + ) + } + + /// + /// Retrieve a single stream associated with a particular descriptor + /// associated with the referenced ticket. A Flight can be composed of one or + /// more streams where each stream can be retrieved using a separate opaque + /// ticket that the flight service uses for managing a collection of streams. + /// + /// - Parameters: + /// - request: Request to send to DoGet. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func doGet( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Push a stream to the flight service associated with a particular + /// flight stream. This allows a client of a flight service to upload a stream + /// of data. Depending on the particular flight service, a client consumer + /// could be allowed to upload a single stream per descriptor or an unlimited + /// number. 
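+ /// As an illustration only (not part of this patch), a callback-style upload
+ /// could look like the following sketch, where `client` is an assumed,
+ /// already-connected client and `flightData` is an assumed, prepared
+ /// `Arrow_Flight_Protocol_FlightData` message:
+ /// ```swift
+ /// let call = client.doPut { putResult in
+ ///   print(putResult) // server acknowledgement for the uploaded stream
+ /// }
+ /// call.sendMessage(flightData, promise: nil)
+ /// call.sendEnd(promise: nil) // no more messages will be sent
+ /// ```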
In the latter, the service might implement a 'seal' action that + /// can be applied to a descriptor once all streams are uploaded. + /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// + /// - Parameters: + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. + internal func doPut( + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_PutResult) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Open a bidirectional data channel for a given descriptor. This + /// allows clients to send and receive arbitrary Arrow data and + /// application-specific metadata in a single logical stream. In + /// contrast to DoGet/DoPut, this is more suited for clients + /// offloading computation (rather than storage) to a Flight service. + /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// + /// - Parameters: + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. + internal func doExchange( + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_FlightData) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [], + handler: handler + ) + } + + /// + /// Flight services can support an arbitrary number of simple actions in + /// addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + /// operations that are potentially available. DoAction allows a flight client + /// to do a specific action against a flight service. An action includes + /// opaque request and response objects that are specific to the type action + /// being undertaken. + /// + /// - Parameters: + /// - request: Request to send to DoAction. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func doAction( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_Result) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [], + handler: handler + ) + } + + /// + /// A flight service exposes all of the available action types that it has + /// along with descriptions. This allows different flight consumers to + /// understand the capabilities of the flight service. 
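+ /// As a usage sketch (not part of this patch; `client` is an assumed,
+ /// already-connected client):
+ /// ```swift
+ /// let call = client.listActions(Arrow_Flight_Protocol_Empty()) { actionType in
+ ///   print(actionType) // one available action type per response message
+ /// }
+ /// ```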
+ /// + /// - Parameters: + /// - request: Request to send to ListActions. + /// - callOptions: Call options. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ServerStreamingCall` with futures for the metadata and status. + internal func listActions( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? = nil, + handler: @escaping (Arrow_Flight_Protocol_ActionType) -> Void + ) -> ServerStreamingCall { + return self.makeServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [], + handler: handler + ) + } +} + +@available(*, deprecated) +extension Arrow_Flight_Protocol_FlightServiceClient: @unchecked Sendable {} + +@available(*, deprecated, renamed: "Arrow_Flight_Protocol_FlightServiceNIOClient") +internal final class Arrow_Flight_Protocol_FlightServiceClient: Arrow_Flight_Protocol_FlightServiceClientProtocol { + private let lock = Lock() + private var _defaultCallOptions: CallOptions + private var _interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? + internal let channel: GRPCChannel + internal var defaultCallOptions: CallOptions { + get { self.lock.withLock { return self._defaultCallOptions } } + set { self.lock.withLockVoid { self._defaultCallOptions = newValue } } + } + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { + get { self.lock.withLock { return self._interceptors } } + set { self.lock.withLockVoid { self._interceptors = newValue } } + } + + /// Creates a client for the arrow.flight.protocol.FlightService service. + /// + /// - Parameters: + /// - channel: `GRPCChannel` to the service host. + /// - defaultCallOptions: Options to use for each service call if the user doesn't provide them. + /// - interceptors: A factory providing interceptors for each RPC. + internal init( + channel: GRPCChannel, + defaultCallOptions: CallOptions = CallOptions(), + interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? = nil + ) { + self.channel = channel + self._defaultCallOptions = defaultCallOptions + self._interceptors = interceptors + } +} + +internal struct Arrow_Flight_Protocol_FlightServiceNIOClient: Arrow_Flight_Protocol_FlightServiceClientProtocol { + internal var channel: GRPCChannel + internal var defaultCallOptions: CallOptions + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? + + /// Creates a client for the arrow.flight.protocol.FlightService service. + /// + /// - Parameters: + /// - channel: `GRPCChannel` to the service host. + /// - defaultCallOptions: Options to use for each service call if the user doesn't provide them. + /// - interceptors: A factory providing interceptors for each RPC. + internal init( + channel: GRPCChannel, + defaultCallOptions: CallOptions = CallOptions(), + interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? = nil + ) { + self.channel = channel + self.defaultCallOptions = defaultCallOptions + self.interceptors = interceptors + } +} + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. 
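+/// As a connection sketch (the host, port, and empty descriptor below are
+/// placeholder assumptions, not part of this patch):
+/// ```swift
+/// let group = MultiThreadedEventLoopGroup(numberOfThreads: 1)
+/// let channel = try GRPCChannelPool.with(
+///   target: .host("localhost", port: 8815),
+///   transportSecurity: .plaintext,
+///   eventLoopGroup: group)
+/// let client = Arrow_Flight_Protocol_FlightServiceAsyncClient(channel: channel)
+/// let info = try await client.getFlightInfo(Arrow_Flight_Protocol_FlightDescriptor())
+/// ```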
Additionally, a flight service +/// can expose a set of actions that are available. +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +internal protocol Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol: GRPCClient { + static var serviceDescriptor: GRPCServiceDescriptor { get } + var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { get } + + func makeHandshakeCall( + callOptions: CallOptions? + ) -> GRPCAsyncBidirectionalStreamingCall + + func makeListFlightsCall( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall + + func makeGetFlightInfoCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? + ) -> GRPCAsyncUnaryCall + + func makeGetSchemaCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? + ) -> GRPCAsyncUnaryCall + + func makeDoGetCall( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall + + func makeDoPutCall( + callOptions: CallOptions? + ) -> GRPCAsyncBidirectionalStreamingCall + + func makeDoExchangeCall( + callOptions: CallOptions? + ) -> GRPCAsyncBidirectionalStreamingCall + + func makeDoActionCall( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall + + func makeListActionsCall( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? + ) -> GRPCAsyncServerStreamingCall +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +extension Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol { + internal static var serviceDescriptor: GRPCServiceDescriptor { + return Arrow_Flight_Protocol_FlightServiceClientMetadata.serviceDescriptor + } + + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? { + return nil + } + + internal func makeHandshakeCall( + callOptions: CallOptions? = nil + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [] + ) + } + + internal func makeListFlightsCall( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [] + ) + } + + internal func makeGetFlightInfoCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> GRPCAsyncUnaryCall { + return self.makeAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [] + ) + } + + internal func makeGetSchemaCall( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) -> GRPCAsyncUnaryCall { + return self.makeAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? 
[] + ) + } + + internal func makeDoGetCall( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [] + ) + } + + internal func makeDoPutCall( + callOptions: CallOptions? = nil + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [] + ) + } + + internal func makeDoExchangeCall( + callOptions: CallOptions? = nil + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [] + ) + } + + internal func makeDoActionCall( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [] + ) + } + + internal func makeListActionsCall( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? = nil + ) -> GRPCAsyncServerStreamingCall { + return self.makeAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [] + ) + } +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +extension Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol { + internal func handshake( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Arrow_Flight_Protocol_HandshakeRequest { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [] + ) + } + + internal func handshake( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Arrow_Flight_Protocol_HandshakeRequest { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [] + ) + } + + internal func listFlights( + _ request: Arrow_Flight_Protocol_Criteria, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights.path, + request: request, + callOptions: callOptions ?? 
self.defaultCallOptions, + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [] + ) + } + + internal func getFlightInfo( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) async throws -> Arrow_Flight_Protocol_FlightInfo { + return try await self.performAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [] + ) + } + + internal func getSchema( + _ request: Arrow_Flight_Protocol_FlightDescriptor, + callOptions: CallOptions? = nil + ) async throws -> Arrow_Flight_Protocol_SchemaResult { + return try await self.performAsyncUnaryCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [] + ) + } + + internal func doGet( + _ request: Arrow_Flight_Protocol_Ticket, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [] + ) + } + + internal func doPut( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [] + ) + } + + internal func doPut( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [] + ) + } + + internal func doExchange( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [] + ) + } + + internal func doExchange( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Arrow_Flight_Protocol_FlightData { + return self.performAsyncBidirectionalStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange.path, + requests: requests, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? 
[] + ) + } + + internal func doAction( + _ request: Arrow_Flight_Protocol_Action, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [] + ) + } + + internal func listActions( + _ request: Arrow_Flight_Protocol_Empty, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream { + return self.performAsyncServerStreamingCall( + path: Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [] + ) + } +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +internal struct Arrow_Flight_Protocol_FlightServiceAsyncClient: Arrow_Flight_Protocol_FlightServiceAsyncClientProtocol { + internal var channel: GRPCChannel + internal var defaultCallOptions: CallOptions + internal var interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? + + internal init( + channel: GRPCChannel, + defaultCallOptions: CallOptions = CallOptions(), + interceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol? = nil + ) { + self.channel = channel + self.defaultCallOptions = defaultCallOptions + self.interceptors = interceptors + } +} + +internal protocol Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol: Sendable { + + /// - Returns: Interceptors to use when invoking 'handshake'. + func makeHandshakeInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'listFlights'. + func makeListFlightsInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'getFlightInfo'. + func makeGetFlightInfoInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'getSchema'. + func makeGetSchemaInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doGet'. + func makeDoGetInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doPut'. + func makeDoPutInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doExchange'. + func makeDoExchangeInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'doAction'. + func makeDoActionInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'listActions'. 
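+ /// As a sketch (not part of this patch), a conforming factory that installs no
+ /// interceptors could be written as below; the empty struct is trivially
+ /// `Sendable`, and the type name `NoInterceptors` is made up for illustration:
+ /// ```swift
+ /// struct NoInterceptors: Arrow_Flight_Protocol_FlightServiceClientInterceptorFactoryProtocol {
+ ///   func makeHandshakeInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_HandshakeRequest, Arrow_Flight_Protocol_HandshakeResponse>] { [] }
+ ///   func makeListFlightsInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_Criteria, Arrow_Flight_Protocol_FlightInfo>] { [] }
+ ///   func makeGetFlightInfoInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_FlightDescriptor, Arrow_Flight_Protocol_FlightInfo>] { [] }
+ ///   func makeGetSchemaInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_FlightDescriptor, Arrow_Flight_Protocol_SchemaResult>] { [] }
+ ///   func makeDoGetInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_Ticket, Arrow_Flight_Protocol_FlightData>] { [] }
+ ///   func makeDoPutInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_FlightData, Arrow_Flight_Protocol_PutResult>] { [] }
+ ///   func makeDoExchangeInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_FlightData, Arrow_Flight_Protocol_FlightData>] { [] }
+ ///   func makeDoActionInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_Action, Arrow_Flight_Protocol_Result>] { [] }
+ ///   func makeListActionsInterceptors() -> [ClientInterceptor<Arrow_Flight_Protocol_Empty, Arrow_Flight_Protocol_ActionType>] { [] }
+ /// }
+ /// ```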
+ func makeListActionsInterceptors() -> [ClientInterceptor] +} + +internal enum Arrow_Flight_Protocol_FlightServiceClientMetadata { + internal static let serviceDescriptor = GRPCServiceDescriptor( + name: "FlightService", + fullName: "arrow.flight.protocol.FlightService", + methods: [ + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.handshake, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listFlights, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getFlightInfo, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.getSchema, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doGet, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doPut, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doExchange, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.doAction, + Arrow_Flight_Protocol_FlightServiceClientMetadata.Methods.listActions, + ] + ) + + internal enum Methods { + internal static let handshake = GRPCMethodDescriptor( + name: "Handshake", + path: "/arrow.flight.protocol.FlightService/Handshake", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let listFlights = GRPCMethodDescriptor( + name: "ListFlights", + path: "/arrow.flight.protocol.FlightService/ListFlights", + type: GRPCCallType.serverStreaming + ) + + internal static let getFlightInfo = GRPCMethodDescriptor( + name: "GetFlightInfo", + path: "/arrow.flight.protocol.FlightService/GetFlightInfo", + type: GRPCCallType.unary + ) + + internal static let getSchema = GRPCMethodDescriptor( + name: "GetSchema", + path: "/arrow.flight.protocol.FlightService/GetSchema", + type: GRPCCallType.unary + ) + + internal static let doGet = GRPCMethodDescriptor( + name: "DoGet", + path: "/arrow.flight.protocol.FlightService/DoGet", + type: GRPCCallType.serverStreaming + ) + + internal static let doPut = GRPCMethodDescriptor( + name: "DoPut", + path: "/arrow.flight.protocol.FlightService/DoPut", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doExchange = GRPCMethodDescriptor( + name: "DoExchange", + path: "/arrow.flight.protocol.FlightService/DoExchange", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doAction = GRPCMethodDescriptor( + name: "DoAction", + path: "/arrow.flight.protocol.FlightService/DoAction", + type: GRPCCallType.serverStreaming + ) + + internal static let listActions = GRPCMethodDescriptor( + name: "ListActions", + path: "/arrow.flight.protocol.FlightService/ListActions", + type: GRPCCallType.serverStreaming + ) + } +} + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. Additionally, a flight service +/// can expose a set of actions that are available. +/// +/// To build a server, implement a class that conforms to this protocol. +internal protocol Arrow_Flight_Protocol_FlightServiceProvider: CallHandlerProvider { + var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { get } + + /// + /// Handshake between client and server. Depending on the server, the + /// handshake may be required to determine the token that should be used for + /// future operations. Both request and response are streams to allow multiple + /// round-trips depending on auth mechanism. 
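+ /// As an implementation sketch (not part of this patch), a trivial provider
+ /// could acknowledge every request with an empty response:
+ /// ```swift
+ /// func handshake(context: StreamingResponseCallContext<Arrow_Flight_Protocol_HandshakeResponse>) -> EventLoopFuture<(StreamEvent<Arrow_Flight_Protocol_HandshakeRequest>) -> Void> {
+ ///   return context.eventLoop.makeSucceededFuture({ event in
+ ///     switch event {
+ ///     case .message: // a handshake request arrived
+ ///       _ = context.sendResponse(Arrow_Flight_Protocol_HandshakeResponse())
+ ///     case .end: // the client finished sending; close the stream with OK
+ ///       context.statusPromise.succeed(.ok)
+ ///     }
+ ///   })
+ /// }
+ /// ```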
+ func handshake(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> + + /// + /// Get a list of available streams given a particular criteria. Most flight + /// services will expose one or more streams that are readily available for + /// retrieval. This api allows listing the streams available for + /// consumption. A user can also provide a criteria. The criteria can limit + /// the subset of streams that can be listed via this interface. Each flight + /// service allows its own definition of how to consume criteria. + func listFlights(request: Arrow_Flight_Protocol_Criteria, context: StreamingResponseCallContext) -> EventLoopFuture + + /// + /// For a given FlightDescriptor, get information about how the flight can be + /// consumed. This is a useful interface if the consumer of the interface + /// already can identify the specific flight to consume. This interface can + /// also allow a consumer to generate a flight stream through a specified + /// descriptor. For example, a flight descriptor might be something that + /// includes a SQL statement or a Pickled Python operation that will be + /// executed. In those cases, the descriptor will not be previously available + /// within the list of available streams provided by ListFlights but will be + /// available for consumption for the duration defined by the specific flight + /// service. + func getFlightInfo(request: Arrow_Flight_Protocol_FlightDescriptor, context: StatusOnlyCallContext) -> EventLoopFuture + + /// + /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + /// This is used when a consumer needs the Schema of flight stream. Similar to + /// GetFlightInfo this interface may generate a new flight that was not previously + /// available in ListFlights. + func getSchema(request: Arrow_Flight_Protocol_FlightDescriptor, context: StatusOnlyCallContext) -> EventLoopFuture + + /// + /// Retrieve a single stream associated with a particular descriptor + /// associated with the referenced ticket. A Flight can be composed of one or + /// more streams where each stream can be retrieved using a separate opaque + /// ticket that the flight service uses for managing a collection of streams. + func doGet(request: Arrow_Flight_Protocol_Ticket, context: StreamingResponseCallContext) -> EventLoopFuture + + /// + /// Push a stream to the flight service associated with a particular + /// flight stream. This allows a client of a flight service to upload a stream + /// of data. Depending on the particular flight service, a client consumer + /// could be allowed to upload a single stream per descriptor or an unlimited + /// number. In the latter, the service might implement a 'seal' action that + /// can be applied to a descriptor once all streams are uploaded. + func doPut(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> + + /// + /// Open a bidirectional data channel for a given descriptor. This + /// allows clients to send and receive arbitrary Arrow data and + /// application-specific metadata in a single logical stream. In + /// contrast to DoGet/DoPut, this is more suited for clients + /// offloading computation (rather than storage) to a Flight service. 
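import GRPC
import NIOCore

// An illustrative sketch of one of this provider's server-streaming
// requirements, written as a free function for brevity; in practice it would
// be a method of a type conforming to the provider protocol. It returns a
// single canned FlightInfo, where a real service would consult its catalog
// and apply the supplied Criteria.
func listFlights(
  request: Arrow_Flight_Protocol_Criteria,
  context: StreamingResponseCallContext<Arrow_Flight_Protocol_FlightInfo>
) -> EventLoopFuture<GRPCStatus> {
  var info = Arrow_Flight_Protocol_FlightInfo()
  info.totalRecords = -1  // -1 means "unknown", per the FlightInfo field comments
  info.totalBytes = -1
  return context.sendResponse(info).map { _ in GRPCStatus.ok }
}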
+ func doExchange(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> + + /// + /// Flight services can support an arbitrary number of simple actions in + /// addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + /// operations that are potentially available. DoAction allows a flight client + /// to do a specific action against a flight service. An action includes + /// opaque request and response objects that are specific to the type action + /// being undertaken. + func doAction(request: Arrow_Flight_Protocol_Action, context: StreamingResponseCallContext) -> EventLoopFuture + + /// + /// A flight service exposes all of the available action types that it has + /// along with descriptions. This allows different flight consumers to + /// understand the capabilities of the flight service. + func listActions(request: Arrow_Flight_Protocol_Empty, context: StreamingResponseCallContext) -> EventLoopFuture +} + +extension Arrow_Flight_Protocol_FlightServiceProvider { + internal var serviceName: Substring { + return Arrow_Flight_Protocol_FlightServiceServerMetadata.serviceDescriptor.fullName[...] + } + + /// Determines, calls and returns the appropriate request handler, depending on the request's method. + /// Returns nil for methods not handled by this service. + internal func handle( + method name: Substring, + context: CallHandlerContext + ) -> GRPCServerHandlerProtocol? { + switch name { + case "Handshake": + return BidirectionalStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [], + observerFactory: self.handshake(context:) + ) + + case "ListFlights": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [], + userFunction: self.listFlights(request:context:) + ) + + case "GetFlightInfo": + return UnaryServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [], + userFunction: self.getFlightInfo(request:context:) + ) + + case "GetSchema": + return UnaryServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [], + userFunction: self.getSchema(request:context:) + ) + + case "DoGet": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [], + userFunction: self.doGet(request:context:) + ) + + case "DoPut": + return BidirectionalStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [], + observerFactory: self.doPut(context:) + ) + + case "DoExchange": + return BidirectionalStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? 
[], + observerFactory: self.doExchange(context:) + ) + + case "DoAction": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [], + userFunction: self.doAction(request:context:) + ) + + case "ListActions": + return ServerStreamingServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [], + userFunction: self.listActions(request:context:) + ) + + default: + return nil + } + } +} + +/// +/// A flight service is an endpoint for retrieving or storing Arrow data. A +/// flight service can expose one or more predefined endpoints that can be +/// accessed using the Arrow Flight Protocol. Additionally, a flight service +/// can expose a set of actions that are available. +/// +/// To implement a server, implement an object which conforms to this protocol. +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +internal protocol Arrow_Flight_Protocol_FlightServiceAsyncProvider: CallHandlerProvider, Sendable { + static var serviceDescriptor: GRPCServiceDescriptor { get } + var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { get } + + /// + /// Handshake between client and server. Depending on the server, the + /// handshake may be required to determine the token that should be used for + /// future operations. Both request and response are streams to allow multiple + /// round-trips depending on auth mechanism. + func handshake( + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Get a list of available streams given a particular criteria. Most flight + /// services will expose one or more streams that are readily available for + /// retrieval. This api allows listing the streams available for + /// consumption. A user can also provide a criteria. The criteria can limit + /// the subset of streams that can be listed via this interface. Each flight + /// service allows its own definition of how to consume criteria. + func listFlights( + request: Arrow_Flight_Protocol_Criteria, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// For a given FlightDescriptor, get information about how the flight can be + /// consumed. This is a useful interface if the consumer of the interface + /// already can identify the specific flight to consume. This interface can + /// also allow a consumer to generate a flight stream through a specified + /// descriptor. For example, a flight descriptor might be something that + /// includes a SQL statement or a Pickled Python operation that will be + /// executed. In those cases, the descriptor will not be previously available + /// within the list of available streams provided by ListFlights but will be + /// available for consumption for the duration defined by the specific flight + /// service. + func getFlightInfo( + request: Arrow_Flight_Protocol_FlightDescriptor, + context: GRPCAsyncServerCallContext + ) async throws -> Arrow_Flight_Protocol_FlightInfo + + /// + /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + /// This is used when a consumer needs the Schema of flight stream. 
Similar to + /// GetFlightInfo this interface may generate a new flight that was not previously + /// available in ListFlights. + func getSchema( + request: Arrow_Flight_Protocol_FlightDescriptor, + context: GRPCAsyncServerCallContext + ) async throws -> Arrow_Flight_Protocol_SchemaResult + + /// + /// Retrieve a single stream associated with a particular descriptor + /// associated with the referenced ticket. A Flight can be composed of one or + /// more streams where each stream can be retrieved using a separate opaque + /// ticket that the flight service uses for managing a collection of streams. + func doGet( + request: Arrow_Flight_Protocol_Ticket, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Push a stream to the flight service associated with a particular + /// flight stream. This allows a client of a flight service to upload a stream + /// of data. Depending on the particular flight service, a client consumer + /// could be allowed to upload a single stream per descriptor or an unlimited + /// number. In the latter, the service might implement a 'seal' action that + /// can be applied to a descriptor once all streams are uploaded. + func doPut( + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Open a bidirectional data channel for a given descriptor. This + /// allows clients to send and receive arbitrary Arrow data and + /// application-specific metadata in a single logical stream. In + /// contrast to DoGet/DoPut, this is more suited for clients + /// offloading computation (rather than storage) to a Flight service. + func doExchange( + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// Flight services can support an arbitrary number of simple actions in + /// addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + /// operations that are potentially available. DoAction allows a flight client + /// to do a specific action against a flight service. An action includes + /// opaque request and response objects that are specific to the type action + /// being undertaken. + func doAction( + request: Arrow_Flight_Protocol_Action, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws + + /// + /// A flight service exposes all of the available action types that it has + /// along with descriptions. This allows different flight consumers to + /// understand the capabilities of the flight service. + func listActions( + request: Arrow_Flight_Protocol_Empty, + responseStream: GRPCAsyncResponseStreamWriter, + context: GRPCAsyncServerCallContext + ) async throws +} + +@available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) +extension Arrow_Flight_Protocol_FlightServiceAsyncProvider { + internal static var serviceDescriptor: GRPCServiceDescriptor { + return Arrow_Flight_Protocol_FlightServiceServerMetadata.serviceDescriptor + } + + internal var serviceName: Substring { + return Arrow_Flight_Protocol_FlightServiceServerMetadata.serviceDescriptor.fullName[...] + } + + internal var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { + return nil + } + + internal func handle( + method name: Substring, + context: CallHandlerContext + ) -> GRPCServerHandlerProtocol? 
{ + switch name { + case "Handshake": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeHandshakeInterceptors() ?? [], + wrapping: { try await self.handshake(requestStream: $0, responseStream: $1, context: $2) } + ) + + case "ListFlights": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListFlightsInterceptors() ?? [], + wrapping: { try await self.listFlights(request: $0, responseStream: $1, context: $2) } + ) + + case "GetFlightInfo": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetFlightInfoInterceptors() ?? [], + wrapping: { try await self.getFlightInfo(request: $0, context: $1) } + ) + + case "GetSchema": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeGetSchemaInterceptors() ?? [], + wrapping: { try await self.getSchema(request: $0, context: $1) } + ) + + case "DoGet": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoGetInterceptors() ?? [], + wrapping: { try await self.doGet(request: $0, responseStream: $1, context: $2) } + ) + + case "DoPut": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoPutInterceptors() ?? [], + wrapping: { try await self.doPut(requestStream: $0, responseStream: $1, context: $2) } + ) + + case "DoExchange": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoExchangeInterceptors() ?? [], + wrapping: { try await self.doExchange(requestStream: $0, responseStream: $1, context: $2) } + ) + + case "DoAction": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeDoActionInterceptors() ?? [], + wrapping: { try await self.doAction(request: $0, responseStream: $1, context: $2) } + ) + + case "ListActions": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeListActionsInterceptors() ?? [], + wrapping: { try await self.listActions(request: $0, responseStream: $1, context: $2) } + ) + + default: + return nil + } + } +} + +internal protocol Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol: Sendable { + + /// - Returns: Interceptors to use when handling 'handshake'. + /// Defaults to calling `self.makeInterceptors()`. + func makeHandshakeInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'listFlights'. + /// Defaults to calling `self.makeInterceptors()`. + func makeListFlightsInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'getFlightInfo'. 
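// A sketch of a server-streaming handler in the async-await style required by
// the AsyncProvider above (assumes the same GRPC imports). It advertises one
// hypothetical "shutdown" action; in practice this would be a method of a
// conforming type.
func listActions(
  request: Arrow_Flight_Protocol_Empty,
  responseStream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>,
  context: GRPCAsyncServerCallContext
) async throws {
  var action = Arrow_Flight_Protocol_ActionType()
  action.type = "shutdown"
  action.description_p = "Shut down this server."
  // send(_:) writes one response message to the stream; returning ends the RPC.
  try await responseStream.send(action)
}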
+ /// Defaults to calling `self.makeInterceptors()`. + func makeGetFlightInfoInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'getSchema'. + /// Defaults to calling `self.makeInterceptors()`. + func makeGetSchemaInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doGet'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoGetInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doPut'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoPutInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doExchange'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoExchangeInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'doAction'. + /// Defaults to calling `self.makeInterceptors()`. + func makeDoActionInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'listActions'. + /// Defaults to calling `self.makeInterceptors()`. + func makeListActionsInterceptors() -> [ServerInterceptor] +} + +internal enum Arrow_Flight_Protocol_FlightServiceServerMetadata { + internal static let serviceDescriptor = GRPCServiceDescriptor( + name: "FlightService", + fullName: "arrow.flight.protocol.FlightService", + methods: [ + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.handshake, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.listFlights, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.getFlightInfo, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.getSchema, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doGet, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doPut, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doExchange, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.doAction, + Arrow_Flight_Protocol_FlightServiceServerMetadata.Methods.listActions, + ] + ) + + internal enum Methods { + internal static let handshake = GRPCMethodDescriptor( + name: "Handshake", + path: "/arrow.flight.protocol.FlightService/Handshake", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let listFlights = GRPCMethodDescriptor( + name: "ListFlights", + path: "/arrow.flight.protocol.FlightService/ListFlights", + type: GRPCCallType.serverStreaming + ) + + internal static let getFlightInfo = GRPCMethodDescriptor( + name: "GetFlightInfo", + path: "/arrow.flight.protocol.FlightService/GetFlightInfo", + type: GRPCCallType.unary + ) + + internal static let getSchema = GRPCMethodDescriptor( + name: "GetSchema", + path: "/arrow.flight.protocol.FlightService/GetSchema", + type: GRPCCallType.unary + ) + + internal static let doGet = GRPCMethodDescriptor( + name: "DoGet", + path: "/arrow.flight.protocol.FlightService/DoGet", + type: GRPCCallType.serverStreaming + ) + + internal static let doPut = GRPCMethodDescriptor( + name: "DoPut", + path: "/arrow.flight.protocol.FlightService/DoPut", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doExchange = GRPCMethodDescriptor( + name: "DoExchange", + path: "/arrow.flight.protocol.FlightService/DoExchange", + type: GRPCCallType.bidirectionalStreaming + ) + + internal static let doAction = GRPCMethodDescriptor( + name: "DoAction", + path: "/arrow.flight.protocol.FlightService/DoAction", + type: GRPCCallType.serverStreaming + ) + + internal static let listActions = GRPCMethodDescriptor( + name: 
"ListActions", + path: "/arrow.flight.protocol.FlightService/ListActions", + type: GRPCCallType.serverStreaming + ) + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift b/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift new file mode 100644 index 0000000000000..b50d4062529c2 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/Flight.pb.swift @@ -0,0 +1,1366 @@ +// DO NOT EDIT. +// swift-format-ignore-file +// +// Generated by the Swift generator plugin for the protocol buffer compiler. +// Source: Flight.proto +// +// For information on using the generated types, please see the documentation: +// https://github.com/apple/swift-protobuf/ + +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation +import SwiftProtobuf + +// If the compiler emits an error on this type, it is because this file +// was generated by a version of the `protoc` Swift plug-in that is +// incompatible with the version of SwiftProtobuf to which you are linking. +// Please ensure that you are building against the same version of the API +// that was used to generate this file. +fileprivate struct _GeneratedWithProtocGenSwiftVersion: SwiftProtobuf.ProtobufAPIVersionCheck { + struct _2: SwiftProtobuf.ProtobufAPIVersion_2 {} + typealias Version = _2 +} + +/// +/// The result of a cancel operation. +/// +/// This is used by CancelFlightInfoResult.status. +enum Arrow_Flight_Protocol_CancelStatus: SwiftProtobuf.Enum { + typealias RawValue = Int + + /// The cancellation status is unknown. Servers should avoid using + /// this value (send a NOT_FOUND error if the requested query is + /// not known). Clients can retry the request. + case unspecified // = 0 + + /// The cancellation request is complete. Subsequent requests with + /// the same payload may return CANCELLED or a NOT_FOUND error. + case cancelled // = 1 + + /// The cancellation request is in progress. The client may retry + /// the cancellation request. + case cancelling // = 2 + + /// The query is not cancellable. The client should not retry the + /// cancellation request. + case notCancellable // = 3 + case UNRECOGNIZED(Int) + + init() { + self = .unspecified + } + + init?(rawValue: Int) { + switch rawValue { + case 0: self = .unspecified + case 1: self = .cancelled + case 2: self = .cancelling + case 3: self = .notCancellable + default: self = .UNRECOGNIZED(rawValue) + } + } + + var rawValue: Int { + switch self { + case .unspecified: return 0 + case .cancelled: return 1 + case .cancelling: return 2 + case .notCancellable: return 3 + case .UNRECOGNIZED(let i): return i + } + } + +} + +#if swift(>=4.2) + +extension Arrow_Flight_Protocol_CancelStatus: CaseIterable { + // The compiler won't synthesize support with the UNRECOGNIZED case. + static var allCases: [Arrow_Flight_Protocol_CancelStatus] = [ + .unspecified, + .cancelled, + .cancelling, + .notCancellable, + ] +} + +#endif // swift(>=4.2) + +/// +/// The request that a client provides to a server on handshake. +struct Arrow_Flight_Protocol_HandshakeRequest { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// A defined protocol version + var protocolVersion: UInt64 = 0 + + /// + /// Arbitrary auth/handshake info. + var payload: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +struct Arrow_Flight_Protocol_HandshakeResponse { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// A defined protocol version + var protocolVersion: UInt64 = 0 + + /// + /// Arbitrary auth/handshake info. 
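// A sketch of one plausible use of the opaque handshake payload described
// above: carrying serialized BasicAuth credentials (the BasicAuth message is
// defined just below). The encoding is an application-level choice, not
// mandated by the protocol.
func makeBasicAuthHandshake(username: String, password: String) throws -> Arrow_Flight_Protocol_HandshakeRequest {
  var auth = Arrow_Flight_Protocol_BasicAuth()
  auth.username = username
  auth.password = password
  var request = Arrow_Flight_Protocol_HandshakeRequest()
  request.payload = try auth.serializedData()  // SwiftProtobuf binary encoding
  return request
}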
+ var payload: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// A message for doing simple auth. +struct Arrow_Flight_Protocol_BasicAuth { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var username: String = String() + + var password: String = String() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +struct Arrow_Flight_Protocol_Empty { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// Describes an available action, including both the name used for execution +/// along with a short description of the purpose of the action. +struct Arrow_Flight_Protocol_ActionType { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: String = String() + + var description_p: String = String() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// A service specific expression that can be used to return a limited set +/// of available Arrow Flight streams. +struct Arrow_Flight_Protocol_Criteria { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var expression: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// An opaque action specific for the service. +struct Arrow_Flight_Protocol_Action { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: String = String() + + var body: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// The request of the CancelFlightInfo action. +/// +/// The request should be stored in Action.body. +struct Arrow_Flight_Protocol_CancelFlightInfoRequest { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var info: Arrow_Flight_Protocol_FlightInfo { + get {return _info ?? Arrow_Flight_Protocol_FlightInfo()} + set {_info = newValue} + } + /// Returns true if `info` has been explicitly set. + var hasInfo: Bool {return self._info != nil} + /// Clears the value of `info`. Subsequent reads from it will return its default value. + mutating func clearInfo() {self._info = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _info: Arrow_Flight_Protocol_FlightInfo? = nil +} + +/// +/// The request of the RenewFlightEndpoint action. +/// +/// The request should be stored in Action.body. +struct Arrow_Flight_Protocol_RenewFlightEndpointRequest { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. 
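// A sketch of the packing pattern the comments above describe: the
// CancelFlightInfoRequest travels inside the opaque Action.body. Here
// `flightInfo` is assumed to come from an earlier GetFlightInfo call.
func makeCancelFlightInfoAction(_ flightInfo: Arrow_Flight_Protocol_FlightInfo) throws -> Arrow_Flight_Protocol_Action {
  var request = Arrow_Flight_Protocol_CancelFlightInfoRequest()
  request.info = flightInfo
  var action = Arrow_Flight_Protocol_Action()
  action.type = "CancelFlightInfo"  // action name defined by the Flight RPC spec
  action.body = try request.serializedData()
  return action
}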
+ + var endpoint: Arrow_Flight_Protocol_FlightEndpoint { + get {return _endpoint ?? Arrow_Flight_Protocol_FlightEndpoint()} + set {_endpoint = newValue} + } + /// Returns true if `endpoint` has been explicitly set. + var hasEndpoint: Bool {return self._endpoint != nil} + /// Clears the value of `endpoint`. Subsequent reads from it will return its default value. + mutating func clearEndpoint() {self._endpoint = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _endpoint: Arrow_Flight_Protocol_FlightEndpoint? = nil +} + +/// +/// An opaque result returned after executing an action. +struct Arrow_Flight_Protocol_Result { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var body: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// The result of the CancelFlightInfo action. +/// +/// The result should be stored in Result.body. +struct Arrow_Flight_Protocol_CancelFlightInfoResult { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var status: Arrow_Flight_Protocol_CancelStatus = .unspecified + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// Wrap the result of a getSchema call +struct Arrow_Flight_Protocol_SchemaResult { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// The schema of the dataset in its IPC form: + /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + /// 4 bytes - the byte length of the payload + /// a flatbuffer Message whose header is the Schema + var schema: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// The name or tag for a Flight. May be used as a way to retrieve or generate +/// a flight or be used to expose a set of previously defined flights. +struct Arrow_Flight_Protocol_FlightDescriptor { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: Arrow_Flight_Protocol_FlightDescriptor.DescriptorType = .unknown + + /// + /// Opaque value used to express a command. Should only be defined when + /// type = CMD. + var cmd: Data = Data() + + /// + /// List of strings identifying a particular dataset. Should only be defined + /// when type = PATH. + var path: [String] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + /// + /// Describes what type of descriptor is defined. + enum DescriptorType: SwiftProtobuf.Enum { + typealias RawValue = Int + + /// Protobuf pattern, not used. + case unknown // = 0 + + /// + /// A named path that identifies a dataset. A path is composed of a string + /// or list of strings describing a particular dataset. This is conceptually + /// similar to a path inside a filesystem. + case path // = 1 + + /// + /// An opaque command to generate a dataset. 
+ case cmd // = 2 + case UNRECOGNIZED(Int) + + init() { + self = .unknown + } + + init?(rawValue: Int) { + switch rawValue { + case 0: self = .unknown + case 1: self = .path + case 2: self = .cmd + default: self = .UNRECOGNIZED(rawValue) + } + } + + var rawValue: Int { + switch self { + case .unknown: return 0 + case .path: return 1 + case .cmd: return 2 + case .UNRECOGNIZED(let i): return i + } + } + + } + + init() {} +} + +#if swift(>=4.2) + +extension Arrow_Flight_Protocol_FlightDescriptor.DescriptorType: CaseIterable { + // The compiler won't synthesize support with the UNRECOGNIZED case. + static var allCases: [Arrow_Flight_Protocol_FlightDescriptor.DescriptorType] = [ + .unknown, + .path, + .cmd, + ] +} + +#endif // swift(>=4.2) + +/// +/// The access coordinates for retrieval of a dataset. With a FlightInfo, a +/// consumer is able to determine how to retrieve a dataset. +struct Arrow_Flight_Protocol_FlightInfo { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// The schema of the dataset in its IPC form: + /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + /// 4 bytes - the byte length of the payload + /// a flatbuffer Message whose header is the Schema + var schema: Data = Data() + + /// + /// The descriptor associated with this info. + var flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor { + get {return _flightDescriptor ?? Arrow_Flight_Protocol_FlightDescriptor()} + set {_flightDescriptor = newValue} + } + /// Returns true if `flightDescriptor` has been explicitly set. + var hasFlightDescriptor: Bool {return self._flightDescriptor != nil} + /// Clears the value of `flightDescriptor`. Subsequent reads from it will return its default value. + mutating func clearFlightDescriptor() {self._flightDescriptor = nil} + + /// + /// A list of endpoints associated with the flight. To consume the + /// whole flight, all endpoints (and hence all Tickets) must be + /// consumed. Endpoints can be consumed in any order. + /// + /// In other words, an application can use multiple endpoints to + /// represent partitioned data. + /// + /// If the returned data has an ordering, an application can use + /// "FlightInfo.ordered = true" or should return the all data in a + /// single endpoint. Otherwise, there is no ordering defined on + /// endpoints or the data within. + /// + /// A client can read ordered data by reading data from returned + /// endpoints, in order, from front to back. + /// + /// Note that a client may ignore "FlightInfo.ordered = true". If an + /// ordering is important for an application, an application must + /// choose one of them: + /// + /// * An application requires that all clients must read data in + /// returned endpoints order. + /// * An application must return the all data in a single endpoint. + var endpoint: [Arrow_Flight_Protocol_FlightEndpoint] = [] + + /// Set these to -1 if unknown. + var totalRecords: Int64 = 0 + + var totalBytes: Int64 = 0 + + /// + /// FlightEndpoints are in the same order as the data. + var ordered: Bool = false + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor? = nil +} + +/// +/// A particular stream or split associated with a flight. +struct Arrow_Flight_Protocol_FlightEndpoint { + // SwiftProtobuf.Message conformance is added in an extension below. 
See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// Token used to retrieve this stream. + var ticket: Arrow_Flight_Protocol_Ticket { + get {return _ticket ?? Arrow_Flight_Protocol_Ticket()} + set {_ticket = newValue} + } + /// Returns true if `ticket` has been explicitly set. + var hasTicket: Bool {return self._ticket != nil} + /// Clears the value of `ticket`. Subsequent reads from it will return its default value. + mutating func clearTicket() {self._ticket = nil} + + /// + /// A list of URIs where this ticket can be redeemed via DoGet(). + /// + /// If the list is empty, the expectation is that the ticket can only + /// be redeemed on the current service where the ticket was + /// generated. + /// + /// If the list is not empty, the expectation is that the ticket can + /// be redeemed at any of the locations, and that the data returned + /// will be equivalent. In this case, the ticket may only be redeemed + /// at one of the given locations, and not (necessarily) on the + /// current service. + /// + /// In other words, an application can use multiple locations to + /// represent redundant and/or load balanced services. + var location: [Arrow_Flight_Protocol_Location] = [] + + /// + /// Expiration time of this stream. If present, clients may assume + /// they can retry DoGet requests. Otherwise, it is + /// application-defined whether DoGet requests may be retried. + var expirationTime: SwiftProtobuf.Google_Protobuf_Timestamp { + get {return _expirationTime ?? SwiftProtobuf.Google_Protobuf_Timestamp()} + set {_expirationTime = newValue} + } + /// Returns true if `expirationTime` has been explicitly set. + var hasExpirationTime: Bool {return self._expirationTime != nil} + /// Clears the value of `expirationTime`. Subsequent reads from it will return its default value. + mutating func clearExpirationTime() {self._expirationTime = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _ticket: Arrow_Flight_Protocol_Ticket? = nil + fileprivate var _expirationTime: SwiftProtobuf.Google_Protobuf_Timestamp? = nil +} + +/// +/// A location where a Flight service will accept retrieval of a particular +/// stream given a ticket. +struct Arrow_Flight_Protocol_Location { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var uri: String = String() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// An opaque identifier that the service can use to retrieve a particular +/// portion of a stream. +/// +/// Tickets are meant to be single use. It is an error/application-defined +/// behavior to reuse a ticket. +struct Arrow_Flight_Protocol_Ticket { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var ticket: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +/// +/// A batch of Arrow data as part of a stream of batches. +struct Arrow_Flight_Protocol_FlightData { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// + /// The descriptor of the data. 
This is only relevant when a client is + /// starting a new DoPut stream. + var flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor { + get {return _flightDescriptor ?? Arrow_Flight_Protocol_FlightDescriptor()} + set {_flightDescriptor = newValue} + } + /// Returns true if `flightDescriptor` has been explicitly set. + var hasFlightDescriptor: Bool {return self._flightDescriptor != nil} + /// Clears the value of `flightDescriptor`. Subsequent reads from it will return its default value. + mutating func clearFlightDescriptor() {self._flightDescriptor = nil} + + /// + /// Header for message data as described in Message.fbs::Message. + var dataHeader: Data = Data() + + /// + /// Application-defined metadata. + var appMetadata: Data = Data() + + /// + /// The actual batch of Arrow data. Preferably handled with minimal-copies + /// coming last in the definition to help with sidecar patterns (it is + /// expected that some implementations will fetch this field off the wire + /// with specialized code to avoid extra memory copies). + var dataBody: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _flightDescriptor: Arrow_Flight_Protocol_FlightDescriptor? = nil +} + +///* +/// The response message associated with the submission of a DoPut. +struct Arrow_Flight_Protocol_PutResult { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var appMetadata: Data = Data() + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +#if swift(>=5.5) && canImport(_Concurrency) +extension Arrow_Flight_Protocol_CancelStatus: @unchecked Sendable {} +extension Arrow_Flight_Protocol_HandshakeRequest: @unchecked Sendable {} +extension Arrow_Flight_Protocol_HandshakeResponse: @unchecked Sendable {} +extension Arrow_Flight_Protocol_BasicAuth: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Empty: @unchecked Sendable {} +extension Arrow_Flight_Protocol_ActionType: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Criteria: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Action: @unchecked Sendable {} +extension Arrow_Flight_Protocol_CancelFlightInfoRequest: @unchecked Sendable {} +extension Arrow_Flight_Protocol_RenewFlightEndpointRequest: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Result: @unchecked Sendable {} +extension Arrow_Flight_Protocol_CancelFlightInfoResult: @unchecked Sendable {} +extension Arrow_Flight_Protocol_SchemaResult: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightDescriptor: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightDescriptor.DescriptorType: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightInfo: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightEndpoint: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Location: @unchecked Sendable {} +extension Arrow_Flight_Protocol_Ticket: @unchecked Sendable {} +extension Arrow_Flight_Protocol_FlightData: @unchecked Sendable {} +extension Arrow_Flight_Protocol_PutResult: @unchecked Sendable {} +#endif // swift(>=5.5) && canImport(_Concurrency) + +// MARK: - Code below here is support for the SwiftProtobuf runtime. 
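// A small sketch of what the runtime-support code below enables: every
// message in this file round-trips through SwiftProtobuf's binary wire
// format (relies on the Foundation and SwiftProtobuf imports at the top of
// the file).
func roundTripTicket() throws -> Bool {
  var ticket = Arrow_Flight_Protocol_Ticket()
  ticket.ticket = Data("batch-0".utf8)
  let bytes = try ticket.serializedData()
  let decoded = try Arrow_Flight_Protocol_Ticket(serializedData: bytes)
  return decoded == ticket  // the generated == compares all fields plus unknown fields
}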
+ +fileprivate let _protobuf_package = "arrow.flight.protocol" + +extension Arrow_Flight_Protocol_CancelStatus: SwiftProtobuf._ProtoNameProviding { + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 0: .same(proto: "CANCEL_STATUS_UNSPECIFIED"), + 1: .same(proto: "CANCEL_STATUS_CANCELLED"), + 2: .same(proto: "CANCEL_STATUS_CANCELLING"), + 3: .same(proto: "CANCEL_STATUS_NOT_CANCELLABLE"), + ] +} + +extension Arrow_Flight_Protocol_HandshakeRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".HandshakeRequest" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "protocol_version"), + 2: .same(proto: "payload"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularUInt64Field(value: &self.protocolVersion) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.payload) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if self.protocolVersion != 0 { + try visitor.visitSingularUInt64Field(value: self.protocolVersion, fieldNumber: 1) + } + if !self.payload.isEmpty { + try visitor.visitSingularBytesField(value: self.payload, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_HandshakeRequest, rhs: Arrow_Flight_Protocol_HandshakeRequest) -> Bool { + if lhs.protocolVersion != rhs.protocolVersion {return false} + if lhs.payload != rhs.payload {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_HandshakeResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".HandshakeResponse" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "protocol_version"), + 2: .same(proto: "payload"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularUInt64Field(value: &self.protocolVersion) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.payload) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if self.protocolVersion != 0 { + try visitor.visitSingularUInt64Field(value: self.protocolVersion, fieldNumber: 1) + } + if !self.payload.isEmpty { + try visitor.visitSingularBytesField(value: self.payload, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_HandshakeResponse, rhs: Arrow_Flight_Protocol_HandshakeResponse) -> Bool { + if lhs.protocolVersion != rhs.protocolVersion {return false} + if lhs.payload != rhs.payload {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_BasicAuth: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".BasicAuth" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 2: .same(proto: "username"), + 3: .same(proto: "password"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 2: try { try decoder.decodeSingularStringField(value: &self.username) }() + case 3: try { try decoder.decodeSingularStringField(value: &self.password) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.username.isEmpty { + try visitor.visitSingularStringField(value: self.username, fieldNumber: 2) + } + if !self.password.isEmpty { + try visitor.visitSingularStringField(value: self.password, fieldNumber: 3) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_BasicAuth, rhs: Arrow_Flight_Protocol_BasicAuth) -> Bool { + if lhs.username != rhs.username {return false} + if lhs.password != rhs.password {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Empty: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Empty" + static let _protobuf_nameMap = SwiftProtobuf._NameMap() + + mutating func decodeMessage(decoder: inout D) throws { + while let _ = try decoder.nextFieldNumber() { + } + } + + func traverse(visitor: inout V) throws { + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Empty, rhs: Arrow_Flight_Protocol_Empty) -> Bool { + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_ActionType: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".ActionType" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .same(proto: "description"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline 
closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.type) }() + case 2: try { try decoder.decodeSingularStringField(value: &self.description_p) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.type.isEmpty { + try visitor.visitSingularStringField(value: self.type, fieldNumber: 1) + } + if !self.description_p.isEmpty { + try visitor.visitSingularStringField(value: self.description_p, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_ActionType, rhs: Arrow_Flight_Protocol_ActionType) -> Bool { + if lhs.type != rhs.type {return false} + if lhs.description_p != rhs.description_p {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Criteria: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Criteria" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "expression"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.expression) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.expression.isEmpty { + try visitor.visitSingularBytesField(value: self.expression, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Criteria, rhs: Arrow_Flight_Protocol_Criteria) -> Bool { + if lhs.expression != rhs.expression {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Action: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Action" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .same(proto: "body"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.type) }() + case 2: try { try decoder.decodeSingularBytesField(value: &self.body) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.type.isEmpty { + try visitor.visitSingularStringField(value: self.type, fieldNumber: 1) + } + if !self.body.isEmpty { + try visitor.visitSingularBytesField(value: self.body, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Action, rhs: Arrow_Flight_Protocol_Action) -> Bool { + if lhs.type != rhs.type {return false} + if lhs.body != rhs.body {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_CancelFlightInfoRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".CancelFlightInfoRequest" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "info"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._info) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._info { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_CancelFlightInfoRequest, rhs: Arrow_Flight_Protocol_CancelFlightInfoRequest) -> Bool { + if lhs._info != rhs._info {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_RenewFlightEndpointRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".RenewFlightEndpointRequest" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "endpoint"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularMessageField(value: &self._endpoint) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + try { if let v = self._endpoint { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_RenewFlightEndpointRequest, rhs: Arrow_Flight_Protocol_RenewFlightEndpointRequest) -> Bool { + if lhs._endpoint != rhs._endpoint {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_Result: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".Result" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "body"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBytesField(value: &self.body) }() + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.body.isEmpty { + try visitor.visitSingularBytesField(value: self.body, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + static func ==(lhs: Arrow_Flight_Protocol_Result, rhs: Arrow_Flight_Protocol_Result) -> Bool { + if lhs.body != rhs.body {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Arrow_Flight_Protocol_CancelFlightInfoResult: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".CancelFlightInfoResult" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "status"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularEnumField(value: &self.status) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    if self.status != .unspecified {
+      try visitor.visitSingularEnumField(value: self.status, fieldNumber: 1)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_CancelFlightInfoResult, rhs: Arrow_Flight_Protocol_CancelFlightInfoResult) -> Bool {
+    if lhs.status != rhs.status {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_SchemaResult: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".SchemaResult"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .same(proto: "schema"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularBytesField(value: &self.schema) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    if !self.schema.isEmpty {
+      try visitor.visitSingularBytesField(value: self.schema, fieldNumber: 1)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_SchemaResult, rhs: Arrow_Flight_Protocol_SchemaResult) -> Bool {
+    if lhs.schema != rhs.schema {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_FlightDescriptor: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".FlightDescriptor"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .same(proto: "type"),
+    2: .same(proto: "cmd"),
+    3: .same(proto: "path"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. 
https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularEnumField(value: &self.type) }()
+      case 2: try { try decoder.decodeSingularBytesField(value: &self.cmd) }()
+      case 3: try { try decoder.decodeRepeatedStringField(value: &self.path) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    if self.type != .unknown {
+      try visitor.visitSingularEnumField(value: self.type, fieldNumber: 1)
+    }
+    if !self.cmd.isEmpty {
+      try visitor.visitSingularBytesField(value: self.cmd, fieldNumber: 2)
+    }
+    if !self.path.isEmpty {
+      try visitor.visitRepeatedStringField(value: self.path, fieldNumber: 3)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_FlightDescriptor, rhs: Arrow_Flight_Protocol_FlightDescriptor) -> Bool {
+    if lhs.type != rhs.type {return false}
+    if lhs.cmd != rhs.cmd {return false}
+    if lhs.path != rhs.path {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_FlightDescriptor.DescriptorType: SwiftProtobuf._ProtoNameProviding {
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    0: .same(proto: "UNKNOWN"),
+    1: .same(proto: "PATH"),
+    2: .same(proto: "CMD"),
+  ]
+}
+
+extension Arrow_Flight_Protocol_FlightInfo: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".FlightInfo"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .same(proto: "schema"),
+    2: .standard(proto: "flight_descriptor"),
+    3: .same(proto: "endpoint"),
+    4: .standard(proto: "total_records"),
+    5: .standard(proto: "total_bytes"),
+    6: .same(proto: "ordered"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularBytesField(value: &self.schema) }()
+      case 2: try { try decoder.decodeSingularMessageField(value: &self._flightDescriptor) }()
+      case 3: try { try decoder.decodeRepeatedMessageField(value: &self.endpoint) }()
+      case 4: try { try decoder.decodeSingularInt64Field(value: &self.totalRecords) }()
+      case 5: try { try decoder.decodeSingularInt64Field(value: &self.totalBytes) }()
+      case 6: try { try decoder.decodeSingularBoolField(value: &self.ordered) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    // The use of inline closures is to circumvent an issue where the compiler
+    // allocates stack space for every if/case branch local when no optimizations
+    // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and
+    // https://github.com/apple/swift-protobuf/issues/1182
+    if !self.schema.isEmpty {
+      try visitor.visitSingularBytesField(value: self.schema, fieldNumber: 1)
+    }
+    try { if let v = self._flightDescriptor {
+      try visitor.visitSingularMessageField(value: v, fieldNumber: 2)
+    } }()
+    if !self.endpoint.isEmpty {
+      try visitor.visitRepeatedMessageField(value: self.endpoint, fieldNumber: 3)
+    }
+    if self.totalRecords != 0 {
+      try visitor.visitSingularInt64Field(value: self.totalRecords, fieldNumber: 4)
+    }
+    if self.totalBytes != 0 {
+      try visitor.visitSingularInt64Field(value: self.totalBytes, fieldNumber: 5)
+    }
+    if self.ordered != false {
+      try visitor.visitSingularBoolField(value: self.ordered, fieldNumber: 6)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_FlightInfo, rhs: Arrow_Flight_Protocol_FlightInfo) -> Bool {
+    if lhs.schema != rhs.schema {return false}
+    if lhs._flightDescriptor != rhs._flightDescriptor {return false}
+    if lhs.endpoint != rhs.endpoint {return false}
+    if lhs.totalRecords != rhs.totalRecords {return false}
+    if lhs.totalBytes != rhs.totalBytes {return false}
+    if lhs.ordered != rhs.ordered {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_FlightEndpoint: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".FlightEndpoint"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .same(proto: "ticket"),
+    2: .same(proto: "location"),
+    3: .standard(proto: "expiration_time"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularMessageField(value: &self._ticket) }()
+      case 2: try { try decoder.decodeRepeatedMessageField(value: &self.location) }()
+      case 3: try { try decoder.decodeSingularMessageField(value: &self._expirationTime) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    // The use of inline closures is to circumvent an issue where the compiler
+    // allocates stack space for every if/case branch local when no optimizations
+    // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and
+    // https://github.com/apple/swift-protobuf/issues/1182
+    try { if let v = self._ticket {
+      try visitor.visitSingularMessageField(value: v, fieldNumber: 1)
+    } }()
+    if !self.location.isEmpty {
+      try visitor.visitRepeatedMessageField(value: self.location, fieldNumber: 2)
+    }
+    try { if let v = self._expirationTime {
+      try visitor.visitSingularMessageField(value: v, fieldNumber: 3)
+    } }()
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_FlightEndpoint, rhs: Arrow_Flight_Protocol_FlightEndpoint) -> Bool {
+    if lhs._ticket != rhs._ticket {return false}
+    if lhs.location != rhs.location {return false}
+    if lhs._expirationTime != rhs._expirationTime {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_Location: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".Location"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .same(proto: "uri"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularStringField(value: &self.uri) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    if !self.uri.isEmpty {
+      try visitor.visitSingularStringField(value: self.uri, fieldNumber: 1)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_Location, rhs: Arrow_Flight_Protocol_Location) -> Bool {
+    if lhs.uri != rhs.uri {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_Ticket: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".Ticket"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .same(proto: "ticket"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. 
https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularBytesField(value: &self.ticket) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    if !self.ticket.isEmpty {
+      try visitor.visitSingularBytesField(value: self.ticket, fieldNumber: 1)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_Ticket, rhs: Arrow_Flight_Protocol_Ticket) -> Bool {
+    if lhs.ticket != rhs.ticket {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_FlightData: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".FlightData"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .standard(proto: "flight_descriptor"),
+    2: .standard(proto: "data_header"),
+    3: .standard(proto: "app_metadata"),
+    1000: .standard(proto: "data_body"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularMessageField(value: &self._flightDescriptor) }()
+      case 2: try { try decoder.decodeSingularBytesField(value: &self.dataHeader) }()
+      case 3: try { try decoder.decodeSingularBytesField(value: &self.appMetadata) }()
+      case 1000: try { try decoder.decodeSingularBytesField(value: &self.dataBody) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    // The use of inline closures is to circumvent an issue where the compiler
+    // allocates stack space for every if/case branch local when no optimizations
+    // are enabled. 
https://github.com/apple/swift-protobuf/issues/1034 and
+    // https://github.com/apple/swift-protobuf/issues/1182
+    try { if let v = self._flightDescriptor {
+      try visitor.visitSingularMessageField(value: v, fieldNumber: 1)
+    } }()
+    if !self.dataHeader.isEmpty {
+      try visitor.visitSingularBytesField(value: self.dataHeader, fieldNumber: 2)
+    }
+    if !self.appMetadata.isEmpty {
+      try visitor.visitSingularBytesField(value: self.appMetadata, fieldNumber: 3)
+    }
+    if !self.dataBody.isEmpty {
+      try visitor.visitSingularBytesField(value: self.dataBody, fieldNumber: 1000)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_FlightData, rhs: Arrow_Flight_Protocol_FlightData) -> Bool {
+    if lhs._flightDescriptor != rhs._flightDescriptor {return false}
+    if lhs.dataHeader != rhs.dataHeader {return false}
+    if lhs.appMetadata != rhs.appMetadata {return false}
+    if lhs.dataBody != rhs.dataBody {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
+
+extension Arrow_Flight_Protocol_PutResult: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding {
+  static let protoMessageName: String = _protobuf_package + ".PutResult"
+  static let _protobuf_nameMap: SwiftProtobuf._NameMap = [
+    1: .standard(proto: "app_metadata"),
+  ]
+
+  mutating func decodeMessage<D: SwiftProtobuf.Decoder>(decoder: inout D) throws {
+    while let fieldNumber = try decoder.nextFieldNumber() {
+      // The use of inline closures is to circumvent an issue where the compiler
+      // allocates stack space for every case branch when no optimizations are
+      // enabled. https://github.com/apple/swift-protobuf/issues/1034
+      switch fieldNumber {
+      case 1: try { try decoder.decodeSingularBytesField(value: &self.appMetadata) }()
+      default: break
+      }
+    }
+  }
+
+  func traverse<V: SwiftProtobuf.Visitor>(visitor: inout V) throws {
+    if !self.appMetadata.isEmpty {
+      try visitor.visitSingularBytesField(value: self.appMetadata, fieldNumber: 1)
+    }
+    try unknownFields.traverse(visitor: &visitor)
+  }
+
+  static func ==(lhs: Arrow_Flight_Protocol_PutResult, rhs: Arrow_Flight_Protocol_PutResult) -> Bool {
+    if lhs.appMetadata != rhs.appMetadata {return false}
+    if lhs.unknownFields != rhs.unknownFields {return false}
+    return true
+  }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift
new file mode 100644
index 0000000000000..04e917d474cff
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightAction.swift
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
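The generated `Flight.pb.swift` above gives every Flight message SwiftProtobuf's binary round trip for free; the hand-written wrapper classes that follow exist mainly to hide the `Arrow_Flight_Protocol_*` names from user code. As a quick orientation (not part of the patch), a message can be built with the generated `with` initializer and serialized via `serializedData()`. A minimal sketch, assuming the SwiftProtobuf runtime is available and using a made-up action name:

```swift
import Foundation
import SwiftProtobuf

// Build a protocol-level Action and round-trip it through the wire format.
var action = Arrow_Flight_Protocol_Action.with {
    $0.type = "clear-cache"                      // hypothetical action name
    $0.body = Data("all".utf8)
}
let wireBytes = try action.serializedData()      // protobuf binary encoding
let decoded = try Arrow_Flight_Protocol_Action(serializedData: wireBytes)
assert(decoded.type == "clear-cache")
```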
+
+import Foundation
+
+public class FlightAction {
+    public let type: String
+    public let body: Data
+    init(_ action: Arrow_Flight_Protocol_Action) {
+        self.type = action.type
+        self.body = action.body
+    }
+
+    public init(_ type: String, body: Data = Data()) {
+        self.type = type
+        self.body = body
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Action {
+        var flightAction = Arrow_Flight_Protocol_Action()
+        flightAction.type = self.type
+        flightAction.body = self.body
+        return flightAction
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift
new file mode 100644
index 0000000000000..b3b06793feade
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightActionType.swift
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightActionType {
+    public let type: String
+    public let description: String
+    init(_ actionType: Arrow_Flight_Protocol_ActionType) {
+        self.type = actionType.type
+        self.description = actionType.description_p
+    }
+
+    public init(_ type: String, description: String) {
+        self.type = type
+        self.description = description
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_ActionType {
+        var actionType = Arrow_Flight_Protocol_ActionType()
+        actionType.type = self.type
+        actionType.description_p = self.description
+        return actionType
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift
new file mode 100644
index 0000000000000..f7b8564af31d7
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
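Each wrapper in this module follows the same two-way pattern: an internal `init(_:)` that adopts a generated protobuf value, and an internal `toProtocol()` that rebuilds one. A small illustrative sketch of how `FlightAction` round-trips (values are hypothetical):

```swift
import Foundation

// Public-side construction, protocol-side conversion, and back.
let action = FlightAction("compact", body: Data("table-1".utf8))
let proto = action.toProtocol()           // Arrow_Flight_Protocol_Action
let roundTripped = FlightAction(proto)    // wraps the generated message again
assert(roundTripped.type == action.type)
```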
+
+import struct Foundation.Data
+import struct Foundation.URL
+import GRPC
+import NIOCore
+import NIOPosix
+import Arrow
+
+public class FlightClient {
+    let client: Arrow_Flight_Protocol_FlightServiceAsyncClient
+    public init(channel: GRPCChannel) {
+        client = Arrow_Flight_Protocol_FlightServiceAsyncClient(channel: channel)
+    }
+
+    public func listActions(_ closure: (FlightActionType) -> Void) async throws {
+        let listActions = client.makeListActionsCall(Arrow_Flight_Protocol_Empty())
+        for try await data in listActions.responseStream {
+            closure(FlightActionType(data))
+        }
+    }
+
+    public func listFlights(_ criteria: FlightCriteria, closure: (FlightInfo) throws -> Void) async throws {
+        let listFlights = client.makeListFlightsCall(criteria.toProtocol())
+        for try await data in listFlights.responseStream {
+            try closure(FlightInfo(data))
+        }
+    }
+
+    public func doAction(_ action: FlightAction, closure: (FlightResult) throws -> Void) async throws {
+        let actionResponse = client.makeDoActionCall(action.toProtocol())
+        for try await data in actionResponse.responseStream {
+            try closure(FlightResult(data))
+        }
+    }
+
+    public func getSchema(_ descriptor: FlightDescriptor) async throws -> FlightSchemaResult {
+        let schemaResultResponse = client.makeGetSchemaCall(descriptor.toProtocol())
+        return FlightSchemaResult(try await schemaResultResponse.response)
+    }
+
+    public func doGet(_ ticket: FlightTicket, readerResultClosure: (ArrowReader.ArrowReaderResult) throws -> Void) async throws {
+        let getResult = client.makeDoGetCall(ticket.toProtocol())
+        let reader = ArrowReader()
+        for try await data in getResult.responseStream {
+            switch reader.fromStream(data.dataBody) {
+            case .success(let rb):
+                try readerResultClosure(rb)
+            case .failure(let error):
+                throw error
+            }
+        }
+    }
+
+    public func doGet(_ ticket: FlightTicket, flightDataClosure: (FlightData) throws -> Void) async throws {
+        let getResult = client.makeDoGetCall(ticket.toProtocol())
+        for try await data in getResult.responseStream {
+            try flightDataClosure(FlightData(data))
+        }
+    }
+
+    public func doPut(_ recordBatches: [RecordBatch], closure: (FlightPutResult) throws -> Void) async throws {
+        if recordBatches.isEmpty {
+            throw ArrowFlightError.EmptyCollection
+        }
+
+        let putCall = client.makeDoPutCall()
+        let writer = ArrowWriter()
+        let writerInfo = ArrowWriter.Info(.recordbatch, schema: recordBatches[0].schema, batches: recordBatches)
+        switch writer.toStream(writerInfo) {
+        case .success(let data):
+            try await putCall.requestStream.send(FlightData(data).toProtocol())
+            putCall.requestStream.finish()
+            for try await response in putCall.responseStream {
+                try closure(FlightPutResult(response))
+            }
+        case .failure(let error):
+            throw error
+        }
+    }
+
+    public func doPut(flightData: FlightData, closure: (FlightPutResult) throws -> Void) async throws {
+        let putCall = client.makeDoPutCall()
+        try await putCall.requestStream.send(flightData.toProtocol())
+        putCall.requestStream.finish()
+        for try await response in putCall.responseStream {
+            try closure(FlightPutResult(response))
+        }
+    }
+
+    public func doExchange(_ recordBatches: [RecordBatch], closure: (ArrowReader.ArrowReaderResult) throws -> Void) async throws {
+        if recordBatches.isEmpty {
+            throw ArrowFlightError.EmptyCollection
+        }
+
+        let exchangeCall = client.makeDoExchangeCall()
+        let writer = ArrowWriter()
+        let info = ArrowWriter.Info(.recordbatch, schema: recordBatches[0].schema, batches: recordBatches)
+        switch writer.toStream(info) {
+        case .success(let data):
+            let request = Arrow_Flight_Protocol_FlightData.with {
+                $0.dataBody = data
+            }
+            try await exchangeCall.requestStream.send(request)
+            exchangeCall.requestStream.finish()
+            let reader = ArrowReader()
+            for try await response in exchangeCall.responseStream {
+                switch reader.fromStream(response.dataBody) {
+                case .success(let rbResult):
+                    try closure(rbResult)
+                case .failure(let error):
+                    throw error
+                }
+            }
+        case .failure(let error):
+            throw error
+        }
+    }
+
+    public func doExchange(flightData: FlightData, closure: (FlightData) throws -> Void) async throws {
+        let exchangeCall = client.makeDoExchangeCall()
+        try await exchangeCall.requestStream.send(flightData.toProtocol())
+        exchangeCall.requestStream.finish()
+        for try await response in exchangeCall.responseStream {
+            try closure(FlightData(response))
+        }
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift
new file mode 100644
index 0000000000000..a887a22ad1737
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightCriteria.swift
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightCriteria {
+    let criteria: Arrow_Flight_Protocol_Criteria
+
+    public var expression: Data { criteria.expression }
+    public init(_ expression: Data = Data()) {
+        criteria = Arrow_Flight_Protocol_Criteria.with {
+            $0.expression = expression
+        }
+    }
+
+    init(_ criteria: Arrow_Flight_Protocol_Criteria) {
+        self.criteria = criteria
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Criteria {
+        return criteria
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift
new file mode 100644
index 0000000000000..004fb785f0c11
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightData.swift
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
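`FlightClient` wraps the generated async gRPC stubs, so every call is a thin adapter: convert the wrapper argument with `toProtocol()`, iterate the response stream, and hand each item back through a closure. A minimal usage sketch, assuming a Flight service is already listening on localhost:8088 (the port the tests later in this patch use):

```swift
import GRPC
import NIOPosix
import struct Foundation.Data

let group = PlatformSupport.makeEventLoopGroup(loopCount: 1)
defer { try? group.syncShutdownGracefully() }
let channel = try GRPCChannelPool.with(
    target: .host("localhost", port: 8088),
    transportSecurity: .plaintext,
    eventLoopGroup: group
)
let client = FlightClient(channel: channel)

// Enumerate available flights, then fetch record batches for one ticket.
try await client.listFlights(FlightCriteria(), closure: { info in
    print("flight with \(info.endpoints.count) endpoint(s)")
})
try await client.doGet(FlightTicket(Data("some-ticket".utf8)),  // hypothetical ticket
                       readerResultClosure: { result in
    print("received \(result.batches.count) record batch(es)")
})
```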
+
+import Foundation
+
+public class FlightData {
+    let flightData: Arrow_Flight_Protocol_FlightData
+    public var flightDescriptor: FlightDescriptor? {
+        return flightData.hasFlightDescriptor ? FlightDescriptor(flightData.flightDescriptor) : nil
+    }
+
+    public var dataBody: Data { flightData.dataBody }
+
+    init(_ flightData: Arrow_Flight_Protocol_FlightData) {
+        self.flightData = flightData
+    }
+
+    public init(_ dataBody: Data, flightDescriptor: FlightDescriptor? = nil) {
+        self.flightData = Arrow_Flight_Protocol_FlightData.with {
+            $0.dataBody = dataBody
+            if let flightDescriptor {
+                $0.flightDescriptor = flightDescriptor.toProtocol()
+            }
+        }
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightData { self.flightData }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift
new file mode 100644
index 0000000000000..68bc91a3deda1
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightDescriptor.swift
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightDescriptor {
+    public enum type {
+        case unknown
+        case path
+        case cmd
+    }
+
+    public let type: FlightDescriptor.type
+    public let cmd: Data
+    public let paths: [String]
+
+    init(_ descriptor: Arrow_Flight_Protocol_FlightDescriptor) {
+        switch descriptor.type {
+        case .cmd: self.type = .cmd
+        case .path: self.type = .path
+        default: self.type = .unknown
+        }
+        self.cmd = descriptor.cmd
+        self.paths = descriptor.path
+    }
+
+    public init(cmd: Data) {
+        self.type = .cmd
+        self.cmd = cmd
+        self.paths = [String]()
+    }
+
+    public init(paths: [String]) {
+        self.type = .path
+        self.cmd = Data()
+        self.paths = paths
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightDescriptor {
+        var descriptor = Arrow_Flight_Protocol_FlightDescriptor()
+        switch self.type {
+        case .cmd: descriptor.type = .cmd
+        case .path: descriptor.type = .path
+        case .unknown: descriptor.type = .unknown
+        }
+        descriptor.cmd = self.cmd
+        descriptor.path = self.paths
+        return descriptor
+    }
+}
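A `FlightDescriptor` identifies a dataset either by an opaque command or by a path, mirroring the proto's `DescriptorType`. A short illustrative sketch (all values hypothetical):

```swift
import Foundation

let byCommand = FlightDescriptor(cmd: Data("SELECT 1".utf8))
let byPath = FlightDescriptor(paths: ["datasets", "nyc-taxi"])
assert(byCommand.type == .cmd && byPath.type == .path)

// The protocol round trip preserves the descriptor's contents.
let restored = FlightDescriptor(byPath.toProtocol())
assert(restored.paths == ["datasets", "nyc-taxi"])
```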
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift
new file mode 100644
index 0000000000000..7c40a2a157ae8
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightEndpoint.swift
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightEndpoint {
+    let ticket: FlightTicket
+    let locations: [FlightLocation]
+    init(_ endpoint: Arrow_Flight_Protocol_FlightEndpoint) {
+        self.ticket = FlightTicket(endpoint.ticket.ticket)
+        self.locations = endpoint.location.map { FlightLocation($0) }
+    }
+
+    public init(_ ticket: FlightTicket, locations: [FlightLocation]) {
+        self.ticket = ticket
+        self.locations = locations
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightEndpoint {
+        var endpoint = Arrow_Flight_Protocol_FlightEndpoint()
+        endpoint.ticket = self.ticket.toProtocol()
+        endpoint.location = self.locations.map { $0.toProtocol() }
+        return endpoint
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift
new file mode 100644
index 0000000000000..b370c00db3d42
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightInfo.swift
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import Arrow
+
+public class FlightInfo {
+    let flightInfo: Arrow_Flight_Protocol_FlightInfo
+    public var flightDescriptor: FlightDescriptor? {
+        return flightInfo.hasFlightDescriptor ? FlightDescriptor(flightInfo.flightDescriptor) : nil
+    }
+
+    public var endpoints: [FlightEndpoint] {
+        return self.flightInfo.endpoint.map { FlightEndpoint($0) }
+    }
+
+    public var schema: Data { flightInfo.schema }
+
+    init(_ flightInfo: Arrow_Flight_Protocol_FlightInfo) {
+        self.flightInfo = flightInfo
+    }
+
+    public init(_ schema: Data, endpoints: [FlightEndpoint] = [FlightEndpoint](), descriptor: FlightDescriptor? = nil) {
+        self.flightInfo = Arrow_Flight_Protocol_FlightInfo.with {
+            $0.schema = schema
+            if let descriptor {
+                $0.flightDescriptor = descriptor.toProtocol()
+            }
+            $0.endpoint = endpoints.map { $0.toProtocol() }
+        }
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_FlightInfo {
+        return self.flightInfo
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift
new file mode 100644
index 0000000000000..b87671c903d44
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightLocation.swift
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightLocation {
+    public let uri: String
+
+    init(_ location: Arrow_Flight_Protocol_Location) {
+        self.uri = location.uri
+    }
+
+    public init(_ uri: String) {
+        self.uri = uri
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Location {
+        var location = Arrow_Flight_Protocol_Location()
+        location.uri = uri
+        return location
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift
new file mode 100644
index 0000000000000..bf73c716e39c0
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightPutResult.swift
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
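A `FlightInfo` ties together the serialized schema and the endpoints a consumer must visit; each endpoint pairs a ticket with candidate locations. A sketch of assembling one by hand, assuming `schemaBytes` holds an IPC-stream-encoded schema (for example from the `schemaToArrowStream` helper defined in FlightServer.swift below):

```swift
import Foundation

let schemaBytes = Data()  // placeholder; real code would serialize an ArrowSchema
let endpoint = FlightEndpoint(
    FlightTicket(Data("ticket-1".utf8)),                  // hypothetical ticket
    locations: [FlightLocation("grpc://localhost:8088")]  // hypothetical location
)
let info = FlightInfo(schemaBytes, endpoints: [endpoint])
assert(info.endpoints.count == 1)
```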
+ +import Foundation + +public class FlightPutResult { + public let appMetadata: Data + public init(_ appMetadata: Data = Data()) { + self.appMetadata = appMetadata + } + + init(_ putResult: Arrow_Flight_Protocol_PutResult) { + self.appMetadata = putResult.appMetadata + } + + func toProtocol() -> Arrow_Flight_Protocol_PutResult { + var putResult = Arrow_Flight_Protocol_PutResult() + putResult.appMetadata = self.appMetadata + return putResult + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift new file mode 100644 index 0000000000000..ba55bede7c70c --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightResult.swift @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import Foundation + +public class FlightResult { + public let body: Data + init(_ result: Arrow_Flight_Protocol_Result) { + self.body = result.body + } + + public init(_ body: Data) { + self.body = body + } + + func toProtocol() -> Arrow_Flight_Protocol_Result { + var result = Arrow_Flight_Protocol_Result() + result.body = self.body + return result + } +} diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift new file mode 100644 index 0000000000000..8d5323b731ea8 --- /dev/null +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightSchemaResult.swift @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
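`FlightResult` and `FlightPutResult` above are the two server-to-client payloads for actions and uploads: the former carries an opaque action result body, the latter app-level metadata acknowledging a `doPut`. `FlightSchemaResult`, defined next, plays the same role for schema-only requests. A short illustrative sketch (values hypothetical):

```swift
import Foundation

// What a doAction handler might emit, and what a doPut handler acknowledges with.
let actionResult = FlightResult(Data("compaction finished".utf8))
let putAck = FlightPutResult(Data("received 4 rows".utf8))
assert(putAck.toProtocol().appMetadata == putAck.appMetadata)
```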
+
+import Foundation
+
+public class FlightSchemaResult {
+    let schemaResult: Arrow_Flight_Protocol_SchemaResult
+
+    public var schema: Data { schemaResult.schema }
+
+    public init(_ schema: Data) {
+        self.schemaResult = Arrow_Flight_Protocol_SchemaResult.with {
+            $0.schema = schema
+        }
+    }
+
+    init(_ schemaResult: Arrow_Flight_Protocol_SchemaResult) {
+        self.schemaResult = schemaResult
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_SchemaResult {
+        return schemaResult
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift
new file mode 100644
index 0000000000000..f67f612b0bcb4
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import GRPC
+import NIO
+import NIOConcurrencyHelpers
+import SwiftProtobuf
+import Arrow
+
+public enum ArrowFlightError: Error {
+    case Unknown(String?)
+    case NotImplemented(String? = nil)
+    case EmptyCollection
+    case IOError(String? = nil)
+}
+
+public func schemaToArrowStream(_ schema: ArrowSchema) throws -> Data {
+    let arrowWriter = ArrowWriter()
+    switch arrowWriter.toStream(ArrowWriter.Info(.schema, schema: schema)) {
+    case .success(let result):
+        return result
+    case .failure(let error):
+        throw error
+    }
+}
+
+public func streamToArrowSchema(_ schema: Data) throws -> ArrowSchema {
+    let schemaResult = ArrowReader().fromStream(schema)
+    switch schemaResult {
+    case .success(let result):
+        if let retSchema = result.schema {
+            return retSchema
+        }
+
+        throw ArrowFlightError.IOError("Schema not found")
+    case .failure(let error):
+        throw error
+    }
+}
+
+public protocol ArrowFlightServer: Sendable {
+    func listFlights(_ criteria: FlightCriteria, writer: FlightInfoStreamWriter) async throws
+    func getFlightInfo(_ request: FlightDescriptor) async throws -> FlightInfo
+    func getSchema(_ request: FlightDescriptor) async throws -> ArrowFlight.FlightSchemaResult
+    func listActions(_ writer: ActionTypeStreamWriter) async throws
+    func doAction(_ action: FlightAction, writer: ResultStreamWriter) async throws
+    func doGet(_ ticket: FlightTicket, writer: RecordBatchStreamWriter) async throws
+    func doPut(_ reader: RecordBatchStreamReader, writer: PutResultDataStreamWriter) async throws
+    func doExchange(_ reader: RecordBatchStreamReader, writer: RecordBatchStreamWriter) async throws
+}
+
+public func MakeFlightServer(_ handler: ArrowFlightServer) -> CallHandlerProvider {
+    return InternalFlightServer(handler)
+}
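`MakeFlightServer` is the only hook a host application needs: implement `ArrowFlightServer`, wrap it in a provider, and hand that to a grpc-swift `Server`. A minimal bootstrap sketch mirroring the pattern the tests below use (`MyServer` is a hypothetical conforming type; the port is arbitrary):

```swift
import GRPC
import NIO

let group = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
let provider = MakeFlightServer(MyServer())  // MyServer: ArrowFlightServer

// Serve Flight over an insecure channel on a fixed port.
let server = try await Server.insecure(group: group)
    .withServiceProviders([provider])
    .bind(host: "localhost", port: 8088)
    .get()
try await server.onClose.get()  // block until shutdown
```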
+
+internal final class InternalFlightServer: Arrow_Flight_Protocol_FlightServiceAsyncProvider {
+    let arrowFlightServer: ArrowFlightServer?
+
+    init(_ arrowFlightServer: ArrowFlightServer?) {
+        self.arrowFlightServer = arrowFlightServer
+    }
+
+    func handshake(requestStream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_HandshakeRequest>, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_HandshakeResponse>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func listFlights(request: Arrow_Flight_Protocol_Criteria, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightInfo>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let writer = FlightInfoStreamWriter(responseStream)
+            try await server.listFlights(FlightCriteria(request), writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func getFlightInfo(request: Arrow_Flight_Protocol_FlightDescriptor, context: GRPC.GRPCAsyncServerCallContext) async throws -> Arrow_Flight_Protocol_FlightInfo {
+        if let server = arrowFlightServer {
+            return try await server.getFlightInfo(FlightDescriptor(request)).toProtocol()
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func getSchema(request: Arrow_Flight_Protocol_FlightDescriptor, context: GRPC.GRPCAsyncServerCallContext) async throws -> Arrow_Flight_Protocol_SchemaResult {
+        if let server = arrowFlightServer {
+            return try await server.getSchema(FlightDescriptor(request)).toProtocol()
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func doGet(request: Arrow_Flight_Protocol_Ticket, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let writer = RecordBatchStreamWriter(responseStream)
+            let ticket = FlightTicket(request)
+            try await server.doGet(ticket, writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func doPut(requestStream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_PutResult>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let reader = RecordBatchStreamReader(requestStream)
+            let writer = PutResultDataStreamWriter(responseStream)
+            try await server.doPut(reader, writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func doExchange(requestStream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let reader = RecordBatchStreamReader(requestStream)
+            let writer = RecordBatchStreamWriter(responseStream)
+            try await server.doExchange(reader, writer: writer)
+            return
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func doAction(request: Arrow_Flight_Protocol_Action, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_Result>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            try await server.doAction(FlightAction(request), writer: ResultStreamWriter(responseStream))
+            return
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    func listActions(request: Arrow_Flight_Protocol_Empty, responseStream: GRPC.GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>, context: GRPC.GRPCAsyncServerCallContext) async throws {
+        if let server = arrowFlightServer {
+            let writer = ActionTypeStreamWriter(responseStream)
+            try await server.listActions(writer)
+            return
+        }
+
+        throw ArrowFlightError.NotImplemented()
+    }
+
+    internal var interceptors: Arrow_Flight_Protocol_FlightServiceServerInterceptorFactoryProtocol? { get { return nil } }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift
new file mode 100644
index 0000000000000..f77fc3545af5c
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightTicket.swift
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+
+public class FlightTicket {
+    public let data: Data
+    init(_ ticket: Arrow_Flight_Protocol_Ticket) {
+        self.data = ticket.ticket
+    }
+
+    public init(_ data: Data) {
+        self.data = data
+    }
+
+    func toProtocol() -> Arrow_Flight_Protocol_Ticket {
+        var ticket = Arrow_Flight_Protocol_Ticket()
+        ticket.ticket = self.data
+        return ticket
+    }
+}
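The final two files adapt gRPC streams to Arrow types: `RecordBatchStreamReader` (next) turns an incoming `FlightData` stream into an `AsyncSequence` of `RecordBatch`, and the writer classes after it wrap the response streams. That is what lets server handlers read naturally, e.g. a `doPut` that acknowledges each received batch, a sketch mirroring the test server later in this patch:

```swift
// Inside a type conforming to ArrowFlightServer.
func doPut(_ reader: RecordBatchStreamReader,
           writer: PutResultDataStreamWriter) async throws {
    for try await batch in reader {                // decoded RecordBatch values
        print("received \(batch.length) rows")     // inspect the decoded batch
        try await writer.write(FlightPutResult())  // empty acknowledgement
    }
}
```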
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift
new file mode 100644
index 0000000000000..a6b9ce93a9acd
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import Arrow
+import GRPC
+
+public class RecordBatchStreamReader: AsyncSequence, AsyncIteratorProtocol {
+    public typealias AsyncIterator = RecordBatchStreamReader
+    public typealias Element = RecordBatch
+    let reader = ArrowReader()
+    var batches = [RecordBatch]()
+    var batchIndex = 0
+    var streamIterator: any AsyncIteratorProtocol
+    let stream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>
+    init(_ stream: GRPC.GRPCAsyncRequestStream<Arrow_Flight_Protocol_FlightData>) {
+        self.stream = stream
+        self.streamIterator = self.stream.makeAsyncIterator()
+    }
+
+    public func next() async throws -> Arrow.RecordBatch? {
+        guard !Task.isCancelled else {
+            return nil
+        }
+
+        if batchIndex < batches.count {
+            let batch = batches[batchIndex]
+            batchIndex += 1
+            return batch
+        }
+
+        while true {
+            let flightData = try await self.streamIterator.next()
+            if flightData == nil {
+                return nil
+            }
+
+            let data = (flightData as! Arrow_Flight_Protocol_FlightData).dataBody
+            switch reader.fromStream(data) {
+            case .success(let rbResult):
+                batches = rbResult.batches
+                // Guard against messages that decode to no record batches.
+                if batches.isEmpty {
+                    continue
+                }
+                batchIndex = 1
+                return batches[0]
+            case .failure(let error):
+                throw error
+            }
+        }
+    }
+
+    public func makeAsyncIterator() -> RecordBatchStreamReader {
+        self
+    }
+}
diff --git a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift
new file mode 100644
index 0000000000000..1efeba5310369
--- /dev/null
+++ b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamWriter.swift
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Foundation
+import Arrow
+import GRPC
+
+public class ActionTypeStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_ActionType>) {
+        self.stream = stream
+    }
+
+    public func write(_ actionType: FlightActionType) async throws {
+        try await self.stream.send(actionType.toProtocol())
+    }
+}
+
+public class ResultStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_Result>
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_Result>) {
+        self.stream = stream
+    }
+
+    public func write(_ result: FlightResult) async throws {
+        try await self.stream.send(result.toProtocol())
+    }
+}
+
+public class FlightInfoStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightInfo>
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightInfo>) {
+        self.stream = stream
+    }
+
+    public func write(_ result: FlightInfo) async throws {
+        try await self.stream.send(result.toProtocol())
+    }
+}
+
+public class PutResultDataStreamWriter {
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_PutResult>
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_PutResult>) {
+        self.stream = stream
+    }
+
+    public func write(_ result: FlightPutResult) async throws {
+        try await self.stream.send(result.toProtocol())
+    }
+}
+
+public class RecordBatchStreamWriter {
+    let writer = ArrowWriter()
+    let stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>
+    init(_ stream: GRPCAsyncResponseStreamWriter<Arrow_Flight_Protocol_FlightData>) {
+        self.stream = stream
+    }
+
+    public func write(_ rb: RecordBatch) async throws {
+        let info = ArrowWriter.Info(.recordbatch,
+                                    schema: rb.schema,
+                                    batches: [rb]
+        )
+
+        let result = writer.toStream(info)
+        switch result {
+        case .success(let rbResult):
+            let data = Arrow_Flight_Protocol_FlightData.with {
+                $0.dataBody = rbResult
+            }
+
+            try await self.stream.send(data)
+        case .failure(let error):
+            throw error
+        }
+    }
+}
diff --git a/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift b/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift
new file mode 100644
index 0000000000000..d0db593b10304
--- /dev/null
+++ b/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift
@@ -0,0 +1,302 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import XCTest
+import struct Foundation.Data
+import struct Foundation.URL
+import GRPC
+import NIOCore
+import NIOPosix
+import Arrow
+
+@testable import ArrowFlight
+
+func makeSchema() -> ArrowSchema {
+    let schemaBuilder = ArrowSchema.Builder()
+    return schemaBuilder.addField("col1", type: ArrowType(ArrowType.ArrowUInt8), isNullable: true)
+        .addField("col2", type: ArrowType(ArrowType.ArrowString), isNullable: false)
+        .addField("col3", type: ArrowType(ArrowType.ArrowDate32), isNullable: false)
+        .finish()
+}
+
+func makeRecordBatch() throws -> RecordBatch {
+    let uint8Builder: NumberArrayBuilder<UInt8> = try ArrowArrayBuilders.loadNumberArrayBuilder()
+    uint8Builder.append(10)
+    uint8Builder.append(22)
+    uint8Builder.append(33)
+    uint8Builder.append(44)
+    let stringBuilder = try ArrowArrayBuilders.loadStringArrayBuilder()
+    stringBuilder.append("test10")
+    stringBuilder.append("test22")
+    stringBuilder.append("test33")
+    stringBuilder.append("test44")
+    let date32Builder = try ArrowArrayBuilders.loadDate32ArrayBuilder()
+    let date2 = Date(timeIntervalSinceReferenceDate: 86400 * 1)
+    let date1 = Date(timeIntervalSinceReferenceDate: 86400 * 5000 + 352)
+    date32Builder.append(date1)
+    date32Builder.append(date2)
+    date32Builder.append(date1)
+    date32Builder.append(date2)
+    let intHolder = ArrowArrayHolder(try uint8Builder.finish())
+    let stringHolder = ArrowArrayHolder(try stringBuilder.finish())
+    let date32Holder = ArrowArrayHolder(try date32Builder.finish())
+    let result = RecordBatch.Builder()
+        .addColumn("col1", arrowArray: intHolder)
+        .addColumn("col2", arrowArray: stringHolder)
+        .addColumn("col3", arrowArray: date32Holder)
+        .finish()
+    switch result {
+    case .success(let recordBatch):
+        return recordBatch
+    case .failure(let error):
+        throw error
+    }
+}
+
+final class MyFlightServer: ArrowFlightServer {
+    func doExchange(_ reader: ArrowFlight.RecordBatchStreamReader, writer: ArrowFlight.RecordBatchStreamWriter) async throws {
+        do {
+            for try await rb in reader {
+                XCTAssertEqual(rb.schema.fields.count, 3)
+                XCTAssertEqual(rb.length, 4)
+            }
+
+            let rb = try makeRecordBatch()
+            try await writer.write(rb)
+        } catch {
+            print("Unknown error: \(error)")
+        }
+    }
+
+    func doPut(_ reader: ArrowFlight.RecordBatchStreamReader, writer: ArrowFlight.PutResultDataStreamWriter) async throws {
+        for try await rb in reader {
+            XCTAssertEqual(rb.schema.fields.count, 3)
+            XCTAssertEqual(rb.length, 4)
+            try await writer.write(FlightPutResult())
+        }
+    }
+
+    func doGet(_ ticket: ArrowFlight.FlightTicket, writer: ArrowFlight.RecordBatchStreamWriter) async throws {
+        try await writer.write(try makeRecordBatch())
+    }
+
+    func getSchema(_ request: ArrowFlight.FlightDescriptor) async throws -> ArrowFlight.FlightSchemaResult {
+        XCTAssertEqual(String(bytes: request.cmd, encoding: .utf8)!, "schema info")
+        XCTAssertEqual(request.type, .cmd)
+        return try ArrowFlight.FlightSchemaResult(schemaToArrowStream(makeSchema()))
+    }
+
+    func getFlightInfo(_ request: ArrowFlight.FlightDescriptor) async throws -> ArrowFlight.FlightInfo {
+        return ArrowFlight.FlightInfo(Data())
+    }
+
+    func listFlights(_ criteria: ArrowFlight.FlightCriteria, writer: ArrowFlight.FlightInfoStreamWriter) async throws {
+        XCTAssertEqual(String(bytes: criteria.expression, encoding: .utf8), "flight criteria expression")
+        let flightInfo = try ArrowFlight.FlightInfo(schemaToArrowStream(makeSchema()))
+        try await writer.write(flightInfo)
+    }
+
+    func listActions(_ writer: ArrowFlight.ActionTypeStreamWriter) async throws {
+        try await writer.write(FlightActionType("type1", description: "desc1"))
+        try await writer.write(FlightActionType("type2", description: "desc2"))
+    }
+
+    func doAction(_ action: FlightAction, writer: ResultStreamWriter) async throws {
+        XCTAssertEqual(action.type, "test_action")
+        XCTAssertEqual(String(bytes: action.body, encoding: .utf8)!, "test_action body")
+        try await writer.write(FlightResult("test_action result".data(using: .utf8)!))
+    }
+}
+
+struct FlightServerImpl {
+    static var server: Server?
+    static var group: MultiThreadedEventLoopGroup?
+    static func run() async throws {
+        do {
+            // Create an event loop group for the server to run on.
+            let group = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
+            // Wrap the test ArrowFlightServer implementation in a gRPC service provider.
+            let provider = ArrowFlight.MakeFlightServer(MyFlightServer())
+
+            // Start the server and print its address once it has started.
+            FlightServerImpl.server = try await Server.insecure(group: group)
+                .withServiceProviders([provider])
+                .bind(host: "localhost", port: 8088)
+                .get()
+
+            // Keep a reference to the group so the test can shut it down later.
+            FlightServerImpl.group = group
+            print("server started on port \(server!.channel.localAddress!.port!)")
+
+            // The caller awaits the server's `onClose` future to stop the program from exiting.
+        } catch {
+            print("Unknown server error: \(error)")
+        }
+    }
+}
+
+public class FlightClientTester {
+    var client: FlightClient?
+    var group: EventLoopGroup?
+    var channel: GRPCChannel?
+
+    init() async throws {
+        let group = PlatformSupport.makeEventLoopGroup(loopCount: 1)
+        let channel = try GRPCChannelPool.with(
+            target: .host("localhost", port: 8088),
+            transportSecurity: .plaintext,
+            eventLoopGroup: group
+        )
+
+        // Keep references so deinit can tear the channel and group down.
+        self.group = group
+        self.channel = channel
+        self.client = FlightClient(channel: channel)
+    }
+
+    deinit {
+        try? channel?.close().wait()
+        try? group?.syncShutdownGracefully()
+    }
+
+    func listActionTest() async throws {
+        var actionTypes = [FlightActionType]()
+        try await client?.listActions({ action in
+            actionTypes.append(action)
+        })
+
+        XCTAssertEqual(actionTypes.count, 2)
+        XCTAssertEqual(actionTypes[0].type, "type1")
+        XCTAssertEqual(actionTypes[0].description, "desc1")
+        XCTAssertEqual(actionTypes[1].type, "type2")
+        XCTAssertEqual(actionTypes[1].description, "desc2")
+    }
+
+    func listFlightsTest() async throws {
+        let flightCriteria = FlightCriteria("flight criteria expression".data(using: .utf8)!)
+        var numCalls = 0
+        try await client?.listFlights(flightCriteria, closure: { data in
+            numCalls += 1
+            let schema = try streamToArrowSchema(data.schema)
+            XCTAssertEqual(schema.fields.count, 3)
+        })
+
+        XCTAssertEqual(numCalls, 1)
+    }
+
+    func doActionTest() async throws {
+        let action = FlightAction("test_action", body: "test_action body".data(using: .utf8)!)
+ var actionResults = [FlightResult]() + try await client?.doAction(action, closure: { result in + actionResults.append(result) + }) + + XCTAssertEqual(actionResults.count, 1) + XCTAssertEqual(String(bytes:actionResults[0].body, encoding: .utf8), "test_action result") + } + + func getSchemaTest() async throws { + let descriptor = FlightDescriptor(cmd: "schema info".data(using: .utf8)!) + let schemaResult = try await client?.getSchema(descriptor) + let schema = try streamToArrowSchema(schemaResult!.schema) + XCTAssertEqual(schema.fields.count, 3) + } + + func doGetTest() async throws { + let ticket = FlightTicket("flight_ticket test".data(using: .utf8)!) + var num_call = 0 + try await client?.doGet(ticket, readerResultClosure: { rb in + num_call += 1 + XCTAssertEqual(rb.schema!.fields.count, 3) + XCTAssertEqual(rb.batches[0].length, 4) + }) + + XCTAssertEqual(num_call, 1) + } + + func doPutTest() async throws { + let rb = try makeRecordBatch() + var num_call = 0 + try await client?.doPut([rb], closure: { result in + num_call += 1 + }) + + XCTAssertEqual(num_call, 1) + } + + func doExchangeTest() async throws { + let rb = try makeRecordBatch() + var num_call = 0 + try await client?.doExchange([rb], closure: { result in + num_call += 1 + XCTAssertEqual(result.schema?.fields.count, 3) + XCTAssertEqual(result.batches[0].length, 4) + }) + + XCTAssertEqual(num_call, 1) + } +} + +actor FlightServerData { + public var serverup = false + func SetServerUp(_ serverUp: Bool) { + self.serverup = serverUp + } + + func IsServerUp() -> Bool { + return serverup + } +} + +final class FlightTest: XCTestCase { + let serverData = FlightServerData() + + func testFlightServer() async throws { + let basicTask = Task { + try await FlightServerImpl.run() + defer { + print("server shutting down") + try! FlightServerImpl.group?.syncShutdownGracefully() + } + + await serverData.SetServerUp(true) + try await FlightServerImpl.server?.onClose.get() + return "done" + } + + let secondTask = Task { + defer { + _ = FlightServerImpl.server?.close() + } + + while await !serverData.IsServerUp() { + try await Task.sleep(nanoseconds: 1_000_000) + } + + let clientImpl = try await FlightClientTester() + try await clientImpl.listActionTest() + try await clientImpl.listFlightsTest() + try await clientImpl.doActionTest() + try await clientImpl.getSchemaTest() + try await clientImpl.doGetTest() + try await clientImpl.doPutTest() + try await clientImpl.doExchangeTest() + + return "done" + } + + let _ = try await [basicTask.value, secondTask.value] + print("done running") + } +} diff --git a/swift/gen-protobuffers.sh b/swift/gen-protobuffers.sh new file mode 100755 index 0000000000000..383a7a2f3195e --- /dev/null +++ b/swift/gen-protobuffers.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -eu
+protoc --swift_out=./ArrowFlight/Sources/ArrowFlight --proto_path=../format Flight.proto
+protoc --grpc-swift_out=./ArrowFlight/Sources/ArrowFlight --proto_path=../format Flight.proto
+cat <<HEADER > header.swift
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+HEADER
+mv ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift.orig
+cat header.swift ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift.orig > ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift
+rm ./ArrowFlight/Sources/ArrowFlight/Flight.grpc.swift.orig
+rm header.swift
\ No newline at end of file

From 819b7d5ce7febb4af9ea6f69947244f46403ce53 Mon Sep 17 00:00:00 2001
From: Nic Crane
Date: Thu, 20 Jul 2023 12:37:44 +0100
Subject: [PATCH 017/749] GH-36720: [R] stringr modifier functions cannot be called with namespace prefix (#36758)

### Rationale for this change

A bug in the implementation of the stringr modifier functions caused them to be swallowed when prefixed with the stringr namespace.

### What changes are included in this PR?

Strips out the `stringr::` prefix when expressions contain `stringr::fixed`, `stringr::coll`, `stringr::boundary` or `stringr::regex`.

### Are these changes tested?

Yes

### Are there any user-facing changes?
Yes * Closes: #36720 Lead-authored-by: Nic Crane Co-authored-by: Dewey Dunnington Signed-off-by: Nic Crane --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/dplyr-funcs-string.R | 18 +++++++++++++ r/tests/testthat/test-dplyr-funcs-string.R | 30 ++++++++++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index aa7b30252bbc0..7eaa51bc5771f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -443,6 +443,7 @@ importFrom(rlang,as_label) importFrom(rlang,as_quosure) importFrom(rlang,call2) importFrom(rlang,call_args) +importFrom(rlang,call_name) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) importFrom(rlang,check_dots_empty0) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 79871d8735c96..8f44f8936bdd3 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -27,7 +27,7 @@ #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure #' @importFrom rlang new_quosures expr_text caller_env check_dots_empty check_dots_empty0 dots_list is_string inform -#' @importFrom rlang is_bare_list +#' @importFrom rlang is_bare_list call_name #' @importFrom tidyselect vars_pull eval_select eval_rename #' @importFrom glue glue #' @useDynLib arrow, .registration = TRUE diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 436083d9de455..b4becb4081bcb 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -56,15 +56,33 @@ get_stringr_pattern_options <- function(pattern) { ) } } + ensure_opts <- function(opts) { if (is.character(opts)) { opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE) } opts } + + pattern <- clean_pattern_namespace(pattern) + ensure_opts(eval(pattern)) } +# Ensure that e.g. stringr::regex and regex both work within patterns +clean_pattern_namespace <- function(pattern) { + modifier_funcs <- c("fixed", "regex", "coll", "boundary") + if (is_call(pattern, modifier_funcs, ns = "stringr")) { + function_called <- call_name(pattern[1]) + + if (function_called %in% modifier_funcs) { + pattern[1] <- call2(function_called) + } + } + + pattern +} + #' Does this string contain regex metacharacters? 
#' #' @param string String to be tested diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index 0dc834dbfea16..fc202bfb3a99e 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -1466,3 +1466,33 @@ test_that("str_remove and str_remove_all", { df ) }) + +test_that("GH-36720: stringr modifier functions can be called with namespace prefix", { + df <- tibble(x = c("Foo", "bar")) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace_all(x, stringr::regex("^f", ignore_case = TRUE), "baz")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_detect(x, stringr::fixed("f", ignore_case = TRUE), negate = TRUE)) %>% + collect(), + df + ) + + x <- Expression$field_ref("x") + + expect_error( + call_binding("str_detect", x, stringr::boundary(type = "character")), + "Pattern modifier `boundary()` not supported in Arrow", + fixed = TRUE + ) + expect_error( + call_binding("str_replace_all", x, stringr::coll("o", locale = "en"), "ó"), + "Pattern modifier `coll()` not supported in Arrow", + fixed = TRUE + ) +}) From f9d8edda36acbd83a552a34392d0df926646d849 Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Fri, 21 Jul 2023 00:40:33 +1000 Subject: [PATCH 018/749] =?UTF-8?q?GH-36698:=20[Go][Parquet]=20Add=20a=20T?= =?UTF-8?q?imestampLogicalType=20creation=20function=20=E2=80=A6=20(#36699?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …with more options This change introduces a more flexible creation function for TimestampLogicalType which will enable changes to all the flags provided by this type, but without requiring a lot of parameters. Following on from other great examples in arrow it uses the functional options pattern. ### Rationale for this change Add a `TimestampLogicalType` creation function with more options, in particular an option to set `fromConverted` as I can't see another way to set this private struct property after creation. ### What changes are included in this PR? This change introduces a more flexible creation function for `TimestampLogicalType` which will enable changes to all the flags provided by this type, but without requiring a lot of parameters. ### Are these changes tested? Yes I have updated one of the existing tests. ### Are there any user-facing changes? * Closes: #36698 Authored-by: Mark Wolfe Signed-off-by: Matt Topol --- go/parquet/schema/logical_types.go | 49 +++++++++++++++++++++++++ go/parquet/schema/logical_types_test.go | 1 + 2 files changed, 50 insertions(+) diff --git a/go/parquet/schema/logical_types.go b/go/parquet/schema/logical_types.go index ade6e750adacb..4075edc1e9402 100644 --- a/go/parquet/schema/logical_types.go +++ b/go/parquet/schema/logical_types.go @@ -616,6 +616,55 @@ func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) Logic } } +// TimestampOpt options used with New Timestamp Logical Type +type TimestampOpt func(*TimestampLogicalType) + +// WithTSIsAdjustedToUTC sets the IsAdjustedToUTC field of the timestamp type. 
+func WithTSIsAdjustedToUTC() TimestampOpt { + return func(t *TimestampLogicalType) { + t.typ.IsAdjustedToUTC = true + } +} + +// WithTSTimeUnitType sets the time unit for the timestamp type +func WithTSTimeUnitType(unit TimeUnitType) TimestampOpt { + return func(t *TimestampLogicalType) { + t.typ.Unit = createTimeUnit(unit) + } +} + +// WithTSForceConverted enable force converted mode +func WithTSForceConverted() TimestampOpt { + return func(t *TimestampLogicalType) { + t.forceConverted = true + } +} + +// WithTSFromConverted enable the timestamp logical type to be +// constructed from a converted type. +func WithTSFromConverted() TimestampOpt { + return func(t *TimestampLogicalType) { + t.fromConverted = true + } +} + +// NewTimestampLogicalTypeWithOpts creates a new TimestampLogicalType with the provided options. +// +// TimestampType Unit defaults to milliseconds (TimeUnitMillis) +func NewTimestampLogicalTypeWithOpts(opts ...TimestampOpt) LogicalType { + ts := &TimestampLogicalType{ + typ: &format.TimestampType{ + Unit: createTimeUnit(TimeUnitMillis), // default to milliseconds + }, + } + + for _, o := range opts { + o(ts) + } + + return ts +} + // TimestampLogicalType represents an int64 number that can be decoded // into a year, month, day, hour, minute, second, and subsecond type TimestampLogicalType struct { diff --git a/go/parquet/schema/logical_types_test.go b/go/parquet/schema/logical_types_test.go index 540899d79a02a..117157f95ef83 100644 --- a/go/parquet/schema/logical_types_test.go +++ b/go/parquet/schema/logical_types_test.go @@ -93,6 +93,7 @@ func TestConvertedTypeCompatibility(t *testing.T) { {"time_micro", schema.NewTimeLogicalType(true /* adjutedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimeMicros}, {"timestamp_milli", schema.NewTimestampLogicalType(true /* adjutedToUTC */, schema.TimeUnitMillis), schema.ConvertedTypes.TimestampMillis}, {"timestamp_micro", schema.NewTimestampLogicalType(true /* adjutedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimestampMicros}, + {"timestamp_milli_opts", schema.NewTimestampLogicalTypeWithOpts(schema.WithTSIsAdjustedToUTC(), schema.WithTSTimeUnitType(schema.TimeUnitMillis)), schema.ConvertedTypes.TimestampMillis}, {"uint8", schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint8}, {"uint16", schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint16}, {"uint32", schema.NewIntLogicalType(32 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint32}, From fca6a6633300694ae9089e92b736ce7d95fd7ed7 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 20 Jul 2023 18:03:12 +0100 Subject: [PATCH 019/749] GH-36787: [R] lintr update leads to failing tests on main (#36788) ### What changes are included in this PR? Turning off the newly introduced indentation linter as it causes test failures, and isn't in sync with styler which we use to style our code. ### Are these changes tested? No, is linter config. ### Are there any user-facing changes? No * Closes: #36787 Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/.lintr | 1 + 1 file changed, 1 insertion(+) diff --git a/r/.lintr b/r/.lintr index 1bd80aff4c62d..085ff45123411 100644 --- a/r/.lintr +++ b/r/.lintr @@ -15,6 +15,7 @@ license: # Licensed to the Apache Software Foundation (ASF) under one # specific language governing permissions and limitations # under the License. 
linters: linters_with_defaults( + indentation_linter = NULL, line_length_linter = line_length_linter(120), object_name_linter = NULL, # Even with a liberal definition of name styles, some of our names cause issues due to `.`s for s3 classes or NA in the name From 15ee52111b5ee6c1623e8d1826b850b54c6de6a5 Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Fri, 21 Jul 2023 05:02:24 +1000 Subject: [PATCH 020/749] GH-36696: [Go] Improve the MapOf and ListOf helpers (#36697) ### Rationale for this change The aim is to improve the MapOf and ListOf helper functions without breaking anything. I have added a `ListOfWithName` which matches the `MapOf` function in that it takes a name, rather than deriving it from the elements name, which should actually be `element`. This just seems clearer to me as an interface, and makes construction a bit more obvious. ### What changes are included in this PR? * Removed references to panics I can't find * Updated error messages for list and map to be clearer with validation errors * Added a ListOfWithName to provide a clearer matching method to MapOf which takes a name Closes #36696 ### Are these changes tested? Yes, I added a test for the new `ListOfWithName` function. ### Are there any user-facing changes? * Closes: #36696 Authored-by: Mark Wolfe Signed-off-by: Matt Topol --- go/parquet/schema/helpers.go | 62 ++++++++++++++++++++----------- go/parquet/schema/helpers_test.go | 19 ++++++++++ 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/go/parquet/schema/helpers.go b/go/parquet/schema/helpers.go index 7cc89efca6e8e..1198b0b926ac8 100644 --- a/go/parquet/schema/helpers.go +++ b/go/parquet/schema/helpers.go @@ -24,43 +24,62 @@ import ( // ListOf is a convenience helper function to create a properly structured // list structure according to the Parquet Spec. // -// group (LIST) { -// repeated group list { -// element; -// } -// } +// group (LIST) { +// repeated group list { +// element; +// } +// } // -// can only be optional or required. panics if repeated. -// can only be optional or required. panics if repeated. +// can only be optional or required. +// can only be optional or required. func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) { - if rep == parquet.Repetitions.Repeated || n.RepetitionType() == parquet.Repetitions.Repeated { - return nil, xerrors.New("parquet: listof repetition and element repetition must not be repeated.") + return ListOfWithName(n.Name(), n, rep, fieldID) +} + +// ListOf is a convenience helper function to create a properly structured +// list structure according to the Parquet Spec. +// +// group (LIST) { +// repeated group list { +// element; +// } +// } +// +// can only be optional or required. +// can only be optional or required. 
+func ListOfWithName(listName string, element Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) { + if rep == parquet.Repetitions.Repeated { + return nil, xerrors.Errorf("parquet: listof repetition must not be repeated, got :%s", rep) } - listName := n.Name() - switch n := n.(type) { + if element.RepetitionType() == parquet.Repetitions.Repeated { + return nil, xerrors.Errorf("parquet: element repetition must not be repeated, got: %s", element.RepetitionType()) + } + + switch n := element.(type) { case *PrimitiveNode: n.name = "element" case *GroupNode: n.name = "element" } - list, err := NewGroupNode("list" /* name */, parquet.Repetitions.Repeated, FieldList{n}, -1 /* fieldID */) + list, err := NewGroupNode("list" /* name */, parquet.Repetitions.Repeated, FieldList{element}, -1 /* fieldID */) if err != nil { return nil, err } + return NewGroupNodeLogical(listName, rep, FieldList{list}, ListLogicalType{}, fieldID) } // MapOf is a convenience helper function to create a properly structured // parquet map node setup according to the Parquet Spec. // -// group (MAP) { -// repeated group key_value { -// required key; -// value; -// } -// } +// group (MAP) { +// repeated group key_value { +// required key; +// value; +// } +// } // // key node will be renamed to "key", value node if not nil will be renamed to "value" // @@ -69,14 +88,15 @@ func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error) { // the key node *must* be required repetition. panics if optional or repeated // // value node can be nil (omitted) or have a repetition of required or optional *only*. -// panics if value node is not nil and has a repetition of repeated. func MapOf(name string, key Node, value Node, mapRep parquet.Repetition, fieldID int32) (*GroupNode, error) { if mapRep == parquet.Repetitions.Repeated { - return nil, xerrors.New("parquet: map repetition cannot be Repeated") + return nil, xerrors.Errorf("parquet: map repetition cannot be Repeated, got: %s", mapRep) } + if key.RepetitionType() != parquet.Repetitions.Required { - return nil, xerrors.New("parquet: map key repetition must be Required") + return nil, xerrors.Errorf("parquet: map key repetition must be Required, got: %s", key.RepetitionType()) } + if value != nil { if value.RepetitionType() == parquet.Repetitions.Repeated { return nil, xerrors.New("parquet: map value cannot have repetition Repeated") diff --git a/go/parquet/schema/helpers_test.go b/go/parquet/schema/helpers_test.go index 055fe7f46d127..b4f0b684003db 100644 --- a/go/parquet/schema/helpers_test.go +++ b/go/parquet/schema/helpers_test.go @@ -62,6 +62,25 @@ func TestListOfNested(t *testing.T) { }`, strings.TrimSpace(buf.String())) } +func TestListOfWithNameNested(t *testing.T) { + n, err := schema.ListOfWithName("arrays", schema.NewInt32Node("element", parquet.Repetitions.Required, -1), parquet.Repetitions.Required, -1) + assert.NoError(t, err) + final, err := schema.ListOf(n, parquet.Repetitions.Required, -1) + assert.NoError(t, err) + + var buf bytes.Buffer + schema.PrintSchema(final, &buf, 4) + assert.Equal(t, + `required group field_id=-1 arrays (List) { + repeated group field_id=-1 list { + required group field_id=-1 element (List) { + repeated group field_id=-1 list { + required int32 field_id=-1 element; + } + } + } +}`, strings.TrimSpace(buf.String())) +} func TestMapOfNestedTypes(t *testing.T) { n, err := schema.NewGroupNode("student", parquet.Repetitions.Required, schema.FieldList{ schema.NewByteArrayNode("name", parquet.Repetitions.Required, -1), 
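To make the new helpers' ergonomics concrete, here is a minimal usage sketch for `ListOfWithName` and `MapOf` as defined in `go/parquet/schema/helpers.go` above. The function signatures are taken from the patch itself; the import path assumes the `v13` Go module line that was current for this release, and the field names (`scores`, `attrs`) are illustrative only.

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v13/parquet"
	"github.com/apache/arrow/go/v13/parquet/schema"
)

func main() {
	// A list field named "scores"; with plain ListOf the group would
	// instead have taken its name from the element node.
	scores, err := schema.ListOfWithName("scores",
		schema.NewInt32Node("element", parquet.Repetitions.Required, -1),
		parquet.Repetitions.Optional, -1)
	if err != nil {
		panic(err)
	}

	// A map field: the key must be Required; the value may be Optional.
	attrs, err := schema.MapOf("attrs",
		schema.NewByteArrayNode("key", parquet.Repetitions.Required, -1),
		schema.NewByteArrayNode("value", parquet.Repetitions.Optional, -1),
		parquet.Repetitions.Optional, -1)
	if err != nil {
		panic(err)
	}

	fmt.Println(scores.Name(), attrs.Name()) // scores attrs
}
```

Note that `MapOf` renames the key and value nodes to `key` and `value` internally, and `ListOfWithName` renames its element node to `element`, so the names passed to the node constructors above are placeholders.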
From d98b3a18a1985c59295e54c635060caf8a0bbd42 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 20 Jul 2023 17:58:29 -0300 Subject: [PATCH 021/749] GH-36750: [R] Fix test-r-devdocs on MacOS (#36751) ### Rationale for this change The test-r-devdocs job is failing. It is failing because we are pinning a version of R that is so old that CRAN no longer serves binaries (#31757), so some of the package builds are failing. ### What changes are included in this PR? Use the `setup-r-dependencies` action which either installs the correct build dependencies or uses older binary versions to avoid building from source (or both). ### Are these changes tested? Yes, as part of the test-r-devdocs job. ### Are there any user-facing changes? No. * Closes: #36750 Authored-by: Dewey Dunnington Signed-off-by: Sutou Kouhei --- dev/tasks/r/github.devdocs.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml index 7126b1418253f..0839e7fc6afcf 100644 --- a/dev/tasks/r/github.devdocs.yml +++ b/dev/tasks/r/github.devdocs.yml @@ -38,10 +38,9 @@ jobs: # remove after https://issues.apache.org/jira/browse/ARROW-16376 r-version: '4.1' - uses: r-lib/actions/setup-pandoc@v2 - - name: Install knitr, rmarkdown - run: | - install.packages(c("rmarkdown", "knitr", "sessioninfo")) - shell: Rscript {0} + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + packages: "rmarkdown, knitr, sessioninfo" - name: Session info run: | options(width = 100) From f43bfd69e97408d06a5de4851aec77f8754bd72f Mon Sep 17 00:00:00 2001 From: lambda <1wei@live.com> Date: Fri, 21 Jul 2023 15:38:32 +0800 Subject: [PATCH 022/749] GH-36770: [C++] Use custom endpoint for s3 using environment variable AWS_ENDPOINT_URL (#36791) ### Rationale for this change we need a way to read custom object storage (such as minio host or other s3-like storage). use environment variable `AWS_ENDPOINT_URL ` ### What changes are included in this PR? set variable endpoint_override according the environment variable ### Are these changes tested? unittest and tested on pyarrow ### Are there any user-facing changes? 
No

* Closes: #36750

Authored-by: Dewey Dunnington
Signed-off-by: Sutou Kouhei
---
 dev/tasks/r/github.devdocs.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml
index 7126b1418253f..0839e7fc6afcf 100644
--- a/dev/tasks/r/github.devdocs.yml
+++ b/dev/tasks/r/github.devdocs.yml
@@ -38,10 +38,9 @@ jobs:
           # remove after https://issues.apache.org/jira/browse/ARROW-16376
           r-version: '4.1'
       - uses: r-lib/actions/setup-pandoc@v2
-      - name: Install knitr, rmarkdown
-        run: |
-          install.packages(c("rmarkdown", "knitr", "sessioninfo"))
-        shell: Rscript {0}
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          packages: "rmarkdown, knitr, sessioninfo"
       - name: Session info
         run: |
           options(width = 100)

From f43bfd69e97408d06a5de4851aec77f8754bd72f Mon Sep 17 00:00:00 2001
From: lambda <1wei@live.com>
Date: Fri, 21 Jul 2023 15:38:32 +0800
Subject: [PATCH 022/749] GH-36770: [C++] Use custom endpoint for s3 using environment variable AWS_ENDPOINT_URL (#36791)

### Rationale for this change

We need a way to read from custom object storage (such as a MinIO host or other S3-like storage), so we use the environment variable `AWS_ENDPOINT_URL`.

### What changes are included in this PR?

Set `endpoint_override` according to the environment variable.

### Are these changes tested?

Yes, with a unit test, and tested manually with PyArrow.

### Are there any user-facing changes?
**This PR contains a "Critical Fix".** * Closes: #36812 Authored-by: Curt Hagenlocher Signed-off-by: Weston Pace --- csharp/src/Apache.Arrow/C/CArrowArray.cs | 14 ++++-- .../src/Apache.Arrow/C/CArrowArrayExporter.cs | 8 ++-- .../src/Apache.Arrow/C/CArrowArrayImporter.cs | 11 +++-- .../src/Apache.Arrow/C/CArrowArrayStream.cs | 39 ++++++++------- .../C/CArrowArrayStreamExporter.cs | 24 ++++------ .../C/CArrowArrayStreamImporter.cs | 25 ++++++++-- csharp/src/Apache.Arrow/C/CArrowSchema.cs | 14 ++++-- .../Apache.Arrow/C/CArrowSchemaExporter.cs | 8 ++-- .../Apache.Arrow/C/CArrowSchemaImporter.cs | 8 +++- .../Apache.Arrow.Tests.csproj | 11 ++++- .../Apache.Arrow.Tests/ArrayBuilderTests.cs | 2 + .../Apache.Arrow.Tests/ArrowArrayTests.cs | 6 +++ .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 4 ++ .../ArrowStreamReaderTests.cs | 12 +++++ .../CDataInterfaceDataTests.cs | 10 ++-- .../CDataInterfacePythonTests.cs | 48 +++++++++++-------- .../CDataInterfaceSchemaTests.cs | 10 ++-- .../Extensions/Net472Extensions.cs | 34 +++++++++++++ 18 files changed, 200 insertions(+), 88 deletions(-) create mode 100644 csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs diff --git a/csharp/src/Apache.Arrow/C/CArrowArray.cs b/csharp/src/Apache.Arrow/C/CArrowArray.cs index a8a084d1d767d..fc609f10fdfa5 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArray.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArray.cs @@ -38,11 +38,11 @@ public unsafe struct CArrowArray public byte** buffers; public CArrowArray** children; public CArrowArray* dictionary; - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged release; +#else + internal IntPtr release; #endif - release; public void* private_data; /// @@ -68,10 +68,14 @@ internal delegate* unmanaged /// public static void Free(CArrowArray* array) { - if (array->release != null) + if (array->release != default) { // Call release if not already called. +#if NET5_0_OR_GREATER array->release(array); +#else + Marshal.GetDelegateForFunctionPointer(array->release)(array); +#endif } Marshal.FreeHGlobal((IntPtr)array); } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 5a793c177e0a6..16aaa3874b370 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -26,9 +26,9 @@ public static class CArrowArrayExporter #if NET5_0_OR_GREATER private static unsafe delegate* unmanaged ReleaseArrayPtr => &ReleaseArray; #else - private unsafe delegate void ReleaseArrowArray(CArrowArray* cArray); + internal unsafe delegate void ReleaseArrowArray(CArrowArray* cArray); private static unsafe readonly NativeDelegate s_releaseArray = new NativeDelegate(ReleaseArray); - private static unsafe delegate* unmanaged[Cdecl] ReleaseArrayPtr => (delegate* unmanaged[Cdecl])s_releaseArray.Pointer; + private static IntPtr ReleaseArrayPtr => s_releaseArray.Pointer; #endif /// /// Export an to a . 
Whether or not the @@ -93,7 +93,7 @@ public static unsafe void ExportRecordBatch(RecordBatch batch, CArrowArray* cArr { throw new ArgumentNullException(nameof(cArray)); } - if (cArray->release != null) + if (cArray->release != default) { throw new ArgumentException("Cannot export array to a struct that is already initialized.", nameof(cArray)); } @@ -191,7 +191,7 @@ private unsafe static void ConvertRecordBatch(ExportedAllocationOwner sharedOwne private unsafe static void ReleaseArray(CArrowArray* cArray) { Dispose(&cArray->private_data); - cArray->release = null; + cArray->release = default; } private unsafe static void* FromDisposable(IDisposable disposable) diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index e1314e5a62253..2f4ebed4b0cf1 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -17,6 +17,7 @@ using System; using System.Collections.Generic; +using System.Runtime.InteropServices; using Apache.Arrow.Memory; using Apache.Arrow.Types; @@ -104,21 +105,25 @@ public ImportedArrowArray(CArrowArray* cArray) { throw new ArgumentNullException(nameof(cArray)); } - if (cArray->release == null) + if (cArray->release == default) { throw new ArgumentException("Tried to import an array that has already been released.", nameof(cArray)); } _cArray = *cArray; - cArray->release = null; + cArray->release = default; } protected override void FinalRelease() { - if (_cArray.release != null) + if (_cArray.release != default) { fixed (CArrowArray* cArray = &_cArray) { +#if NET5_0_OR_GREATER cArray->release(cArray); +#else + Marshal.GetDelegateForFunctionPointer(cArray->release)(cArray); +#endif } } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs b/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs index a900a6895a097..9cc9984c6ec8f 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayStream.cs @@ -35,11 +35,11 @@ public unsafe struct CArrowArrayStream /// /// Return value: 0 if successful, an `errno`-compatible error code otherwise. /// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged get_schema; +#else + internal IntPtr get_schema; #endif - get_schema; /// /// Callback to get the next array. If no error and the array is released, the stream has ended. @@ -47,11 +47,11 @@ internal delegate* unmanaged /// /// Return value: 0 if successful, an `errno`-compatible error code otherwise. /// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged get_next; +#else + internal IntPtr get_next; #endif - get_next; /// /// Callback to get optional detailed error information. This must only @@ -62,21 +62,21 @@ internal delegate* unmanaged /// Return value: pointer to a null-terminated character array describing the last /// error, or NULL if no description is available. /// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged get_last_error; +#else + internal IntPtr get_last_error; #endif - get_last_error; /// /// Release callback: release the stream's own resources. Note that arrays returned by /// get_next must be individually released. 
/// - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged release; +#else + internal IntPtr release; #endif - release; public void* private_data; @@ -103,10 +103,15 @@ internal delegate* unmanaged /// public static void Free(CArrowArrayStream* arrayStream) { - if (arrayStream->release != null) + if (arrayStream->release != default) { // Call release if not already called. +#if NET5_0_OR_GREATER + arrayStream->release(arrayStream); +#else + Marshal.GetDelegateForFunctionPointer(arrayStream->release)(arrayStream); +#endif } Marshal.FreeHGlobal((IntPtr)arrayStream); } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs index c748eed915d89..0a0f1cc837459 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayStreamExporter.cs @@ -29,22 +29,18 @@ public static class CArrowArrayStreamExporter private static unsafe delegate* unmanaged GetLastErrorPtr => &GetLastError; private static unsafe delegate* unmanaged ReleasePtr => &Release; #else - private unsafe delegate int GetSchemaArrayStream(CArrowArrayStream* cArrayStream, CArrowSchema* cSchema); + internal unsafe delegate int GetSchemaArrayStream(CArrowArrayStream* cArrayStream, CArrowSchema* cSchema); private static unsafe NativeDelegate s_getSchemaArrayStream = new NativeDelegate(GetSchema); - private static unsafe delegate* unmanaged[Cdecl] GetSchemaPtr => - (delegate* unmanaged[Cdecl])s_getSchemaArrayStream.Pointer; - private unsafe delegate int GetNextArrayStream(CArrowArrayStream* cArrayStream, CArrowArray* cArray); + private static unsafe IntPtr GetSchemaPtr => s_getSchemaArrayStream.Pointer; + internal unsafe delegate int GetNextArrayStream(CArrowArrayStream* cArrayStream, CArrowArray* cArray); private static unsafe NativeDelegate s_getNextArrayStream = new NativeDelegate(GetNext); - private static unsafe delegate* unmanaged[Cdecl] GetNextPtr => - (delegate* unmanaged[Cdecl])s_getNextArrayStream.Pointer; - private unsafe delegate byte* GetLastErrorArrayStream(CArrowArrayStream* cArrayStream); + private static unsafe IntPtr GetNextPtr => s_getNextArrayStream.Pointer; + internal unsafe delegate byte* GetLastErrorArrayStream(CArrowArrayStream* cArrayStream); private static unsafe NativeDelegate s_getLastErrorArrayStream = new NativeDelegate(GetLastError); - private static unsafe delegate* unmanaged[Cdecl] GetLastErrorPtr => - (delegate* unmanaged[Cdecl])s_getLastErrorArrayStream.Pointer; - private unsafe delegate void ReleaseArrayStream(CArrowArrayStream* cArrayStream); + private static unsafe IntPtr GetLastErrorPtr => s_getLastErrorArrayStream.Pointer; + internal unsafe delegate void ReleaseArrayStream(CArrowArrayStream* cArrayStream); private static unsafe NativeDelegate s_releaseArrayStream = new NativeDelegate(Release); - private static unsafe delegate* unmanaged[Cdecl] ReleasePtr => - (delegate* unmanaged[Cdecl])s_releaseArrayStream.Pointer; + private static unsafe IntPtr ReleasePtr => s_releaseArrayStream.Pointer; #endif /// @@ -103,7 +99,7 @@ private unsafe static int GetNext(CArrowArrayStream* cArrayStream, CArrowArray* ExportedArrayStream arrayStream = null; try { - cArray->release = null; + cArray->release = default; arrayStream = ExportedArrayStream.FromPointer(cArrayStream->private_data); RecordBatch recordBatch = arrayStream.ArrowArrayStream.ReadNextRecordBatchAsync().Result; if (recordBatch != null) @@ -140,7 +136,7 @@ private unsafe 
static int GetNext(CArrowArrayStream* cArrayStream, CArrowArray* private unsafe static void Release(CArrowArrayStream* cArrayStream) { ExportedArrayStream.Free(&cArrayStream->private_data); - cArrayStream->release = null; + cArrayStream->release = default; } sealed unsafe class ExportedArrayStream : IDisposable diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs index 7e70632bf82fc..fe0a307c9b26c 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs @@ -16,6 +16,7 @@ // under the License. using System; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using Apache.Arrow.Ipc; @@ -57,7 +58,11 @@ private sealed unsafe class ImportedArrowArrayStream : IArrowArrayStream internal static string GetLastError(CArrowArrayStream* arrayStream, int errno) { +#if NET5_0_OR_GREATER byte* error = arrayStream->get_last_error(arrayStream); +#else + byte* error = Marshal.GetDelegateForFunctionPointer(arrayStream->get_last_error)(arrayStream); +#endif if (error == null) { return $"Array stream operation failed with no message. Error code: {errno}"; @@ -71,13 +76,17 @@ public ImportedArrowArrayStream(CArrowArrayStream* cArrayStream) { throw new ArgumentNullException(nameof(cArrayStream)); } - if (cArrayStream->release == null) + if (cArrayStream->release == default) { throw new ArgumentException("Tried to import an array stream that has already been released.", nameof(cArrayStream)); } CArrowSchema cSchema = new CArrowSchema(); +#if NET5_0_OR_GREATER int errno = cArrayStream->get_schema(cArrayStream, &cSchema); +#else + int errno = Marshal.GetDelegateForFunctionPointer(cArrayStream->get_schema)(cArrayStream, &cSchema); +#endif if (errno != 0) { throw new Exception(GetLastError(cArrayStream, errno)); @@ -85,7 +94,7 @@ public ImportedArrowArrayStream(CArrowArrayStream* cArrayStream) _schema = CArrowSchemaImporter.ImportSchema(&cSchema); _cArrayStream = *cArrayStream; - cArrayStream->release = null; + cArrayStream->release = default; } ~ImportedArrowArrayStream() @@ -111,12 +120,16 @@ public ValueTask ReadNextRecordBatchAsync(CancellationToken cancell CArrowArray cArray = new CArrowArray(); fixed (CArrowArrayStream* cArrayStream = &_cArrayStream) { +#if NET5_0_OR_GREATER int errno = cArrayStream->get_next(cArrayStream, &cArray); +#else + int errno = Marshal.GetDelegateForFunctionPointer(cArrayStream->get_next)(cArrayStream, &cArray); +#endif if (errno != 0) { return new(Task.FromException(new Exception(GetLastError(cArrayStream, errno)))); } - if (cArray.release != null) + if (cArray.release != default) { result = CArrowArrayImporter.ImportRecordBatch(&cArray, _schema); } @@ -127,12 +140,16 @@ public ValueTask ReadNextRecordBatchAsync(CancellationToken cancell public void Dispose() { - if (!_disposed && _cArrayStream.release != null) + if (!_disposed && _cArrayStream.release != default) { _disposed = true; fixed (CArrowArrayStream* cArrayStream = &_cArrayStream) { +#if NET5_0_OR_GREATER cArrayStream->release(cArrayStream); +#else + Marshal.GetDelegateForFunctionPointer(cArrayStream->release)(cArrayStream); +#endif } } GC.SuppressFinalize(this); diff --git a/csharp/src/Apache.Arrow/C/CArrowSchema.cs b/csharp/src/Apache.Arrow/C/CArrowSchema.cs index 64761dbd0d095..50c363b07720f 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchema.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchema.cs @@ -39,11 +39,11 @@ public unsafe struct 
CArrowSchema public long n_children; public CArrowSchema** children; public CArrowSchema* dictionary; - internal delegate* unmanaged -#if !NET5_0_OR_GREATER - [Cdecl] +#if NET5_0_OR_GREATER + internal delegate* unmanaged release; +#else + internal IntPtr release; #endif - release; public void* private_data; /// @@ -69,10 +69,14 @@ internal delegate* unmanaged /// public static void Free(CArrowSchema* schema) { - if (schema->release != null) + if (schema->release != default) { // Call release if not already called. +#if NET5_0_OR_GREATER schema->release(schema); +#else + Marshal.GetDelegateForFunctionPointer(schema->release)(schema); +#endif } Marshal.FreeHGlobal((IntPtr)schema); } diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index 9053e80664e31..696212eda36c7 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -30,9 +30,9 @@ public static class CArrowSchemaExporter #if NET5_0_OR_GREATER private static unsafe delegate* unmanaged ReleaseSchemaPtr => &ReleaseCArrowSchema; #else - private unsafe delegate void ReleaseArrowSchema(CArrowSchema* cArray); + internal unsafe delegate void ReleaseArrowSchema(CArrowSchema* cArray); private static unsafe readonly NativeDelegate s_releaseSchema = new NativeDelegate(ReleaseCArrowSchema); - private static unsafe delegate* unmanaged[Cdecl] ReleaseSchemaPtr => (delegate* unmanaged[Cdecl])s_releaseSchema.Pointer; + private static IntPtr ReleaseSchemaPtr => s_releaseSchema.Pointer; #endif /// @@ -297,7 +297,7 @@ private unsafe static void WriteMetadataString(ref byte* ptr, int length, string private static unsafe void ReleaseCArrowSchema(CArrowSchema* schema) { if (schema == null) return; - if (schema->release == null) return; + if (schema->release == default) return; Marshal.FreeHGlobal((IntPtr)schema->format); Marshal.FreeHGlobal((IntPtr)schema->name); @@ -324,7 +324,7 @@ private static unsafe void ReleaseCArrowSchema(CArrowSchema* schema) schema->n_children = 0; schema->dictionary = null; schema->children = null; - schema->release = null; + schema->release = default; } } } diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index 89c9481270c79..b21f24edba9af 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -113,7 +113,7 @@ public ImportedArrowSchema(CArrowSchema* cSchema) throw new ArgumentException("Passed null pointer for cSchema."); } _cSchema = cSchema; - if (_cSchema->release == null) + if (_cSchema->release == default) { throw new ArgumentException("Tried to import a schema that has already been released."); } @@ -128,9 +128,13 @@ public ImportedArrowSchema(CArrowSchema* handle, bool isRoot) : this(handle) public void Dispose() { // We only call release on a root-level schema, not child ones. 
- if (_isRoot && _cSchema->release != null) + if (_isRoot && _cSchema->release != default) { +#if NET5_0_OR_GREATER _cSchema->release(_cSchema); +#else + Marshal.GetDelegateForFunctionPointer(_cSchema->release)(_cSchema); +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 55005a91c74a1..805fb5ab3acce 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -3,6 +3,9 @@ net7.0 + true @@ -24,5 +27,9 @@ - - \ No newline at end of file + + + + + + diff --git a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs index 0c40fd82af7ce..2568e5e8bdab8 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs @@ -26,6 +26,7 @@ public class ArrayBuilderTests { // TODO: Test various builder invariants (Append, AppendRange, Clear, Resize, Reserve, etc) +#if NET5_0_OR_GREATER [Fact] public void PrimitiveArrayBuildersProduceExpectedArray() { @@ -73,6 +74,7 @@ static void Test() where TBuilder : PrimitiveArrayBuilder, new() => TestArrayBuilder(x => x.Append(T.CreateChecked(123)).AppendNull().AppendNull().Append(T.CreateChecked(127)), 4, 2, 0x09); } +#endif [Fact] public void BooleanArrayBuilderProducersExpectedArray() diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index 16fca684ff5ec..d4f0d8dfd0383 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -93,6 +93,7 @@ void TestIsValid(ArrowBuffer valueBuf, ArrowBuffer nullBitmapBuf, int length, in } } +#if NET5_0_OR_GREATER [Fact] public void SliceArray() { @@ -145,6 +146,7 @@ static void TestNumberSlice() where TBuilder : PrimitiveArrayBuilder, new() => TestSlice(x => x.AppendNull().Append(T.CreateChecked(10)).Append(T.CreateChecked(20)).AppendNull().Append(T.CreateChecked(30))); } +#endif [Fact] public void SliceBooleanArray() @@ -198,7 +200,9 @@ private class ArraySliceValidator : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, +#if NET5_0_OR_GREATER IArrowArrayVisitor, +#endif IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -240,7 +244,9 @@ public void Visit(Date64Array array) public void Visit(Time32Array array) => ValidateArrays(array); public void Visit(Time64Array array) => ValidateArrays(array); +#if NET5_0_OR_GREATER public void Visit(HalfFloatArray array) => ValidateArrays(array); +#endif public void Visit(FloatArray array) => ValidateArrays(array); public void Visit(DoubleArray array) => ValidateArrays(array); public void Visit(StringArray array) => ValidateArrays(array); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index acfe72f83195e..543b446bba876 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -74,7 +74,9 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, +#if NET5_0_OR_GREATER IArrowArrayVisitor, +#endif IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -112,7 +114,9 @@ public ArrayComparer(IArrowArray expectedArray, bool strictCompare) public void Visit(UInt16Array array) => CompareArrays(array); public void Visit(UInt32Array array) => CompareArrays(array); public void 
Visit(UInt64Array array) => CompareArrays(array); +#if NET5_0_OR_GREATER public void Visit(HalfFloatArray array) => CompareArrays(array); +#endif public void Visit(FloatArray array) => CompareArrays(array); public void Visit(DoubleArray array) => CompareArrays(array); public void Visit(BooleanArray array) => CompareArrays(array); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs index 0e8c9d6687a02..ed030cc6ace11 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs @@ -224,6 +224,7 @@ private class PartialReadStream : MemoryStream // by default return 20 bytes at a time public int PartialReadLength { get; set; } = 20; +#if NET5_0_OR_GREATER public override int Read(Span destination) { if (destination.Length > PartialReadLength) @@ -243,6 +244,17 @@ public override ValueTask ReadAsync(Memory destination, CancellationT return base.ReadAsync(destination, cancellationToken); } +#else + public override int Read(byte[] buffer, int offset, int length) + { + return base.Read(buffer, offset, Math.Min(length, PartialReadLength)); + } + + public override Task ReadAsync(byte[] buffer, int offset, int length, CancellationToken cancellationToken = default) + { + return base.ReadAsync(buffer, offset, Math.Min(length, PartialReadLength), cancellationToken); + } +#endif } } } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs index a430e140cfc2a..2bd4d4d661942 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs @@ -47,7 +47,7 @@ public unsafe void InitializeArrayZeroed() Assert.True(cArray->buffers == null); Assert.True(cArray->children == null); Assert.True(cArray->dictionary == null); - Assert.True(cArray->release == null); + Assert.True(cArray->release == default); Assert.True(cArray->private_data == null); CArrowArray.Free(cArray); @@ -59,12 +59,13 @@ public unsafe void CallsReleaseForValid() IArrowArray array = GetTestArray(); CArrowArray* cArray = CArrowArray.Create(); CArrowArrayExporter.ExportArray(array, cArray); - Assert.False(cArray->release == null); + Assert.False(cArray->release == default); CArrowArrayImporter.ImportArray(cArray, array.Data.DataType).Dispose(); - Assert.True(cArray->release == null); + Assert.True(cArray->release == default); CArrowArray.Free(cArray); } +#if NET5_0_OR_GREATER [Fact] public unsafe void CallsReleaseForInvalid() { @@ -75,7 +76,7 @@ public unsafe void CallsReleaseForInvalid() var releaseCallback = (CArrowArray* cArray) => { wasCalled = true; - cArray->release = null; + cArray->release = default; }; cArray->release = (delegate* unmanaged)Marshal.GetFunctionPointerForDelegate( releaseCallback); @@ -90,5 +91,6 @@ public unsafe void CallsReleaseForInvalid() GC.KeepAlive(releaseCallback); } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 084d7bfb014cc..4c53b98e3d9f1 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -28,31 +28,39 @@ namespace Apache.Arrow.Tests { - public class CDataSchemaPythonTest + public class CDataSchemaPythonTest : IClassFixture { - public CDataSchemaPythonTest() + class PythonNet : IDisposable { - bool inCIJob = 
Environment.GetEnvironmentVariable("GITHUB_ACTIONS") == "true"; - bool inVerificationJob = Environment.GetEnvironmentVariable("TEST_CSHARP") == "1"; - bool pythonSet = Environment.GetEnvironmentVariable("PYTHONNET_PYDLL") != null; - // We only skip if this is not in CI - if (inCIJob && !inVerificationJob && !pythonSet) + public PythonNet() { - throw new Exception("PYTHONNET_PYDLL not set; skipping C Data Interface tests."); - } - else - { - Skip.If(!pythonSet, "PYTHONNET_PYDLL not set; skipping C Data Interface tests."); - } + bool inCIJob = Environment.GetEnvironmentVariable("GITHUB_ACTIONS") == "true"; + bool inVerificationJob = Environment.GetEnvironmentVariable("TEST_CSHARP") == "1"; + bool pythonSet = Environment.GetEnvironmentVariable("PYTHONNET_PYDLL") != null; + // We only skip if this is not in CI + if (inCIJob && !inVerificationJob && !pythonSet) + { + throw new Exception("PYTHONNET_PYDLL not set; skipping C Data Interface tests."); + } + else + { + Skip.If(!pythonSet, "PYTHONNET_PYDLL not set; skipping C Data Interface tests."); + } + + PythonEngine.Initialize(); - PythonEngine.Initialize(); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && + PythonEngine.PythonPath.IndexOf("dlls", StringComparison.OrdinalIgnoreCase) < 0) + { + dynamic sys = Py.Import("sys"); + sys.path.append(Path.Combine(Path.GetDirectoryName(Environment.GetEnvironmentVariable("PYTHONNET_PYDLL")), "DLLs")); + } + } - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && - !PythonEngine.PythonPath.Contains("dlls", StringComparison.OrdinalIgnoreCase)) + public void Dispose() { - dynamic sys = Py.Import("sys"); - sys.path.append(Path.Combine(Path.GetDirectoryName(Environment.GetEnvironmentVariable("PYTHONNET_PYDLL")), "DLLs")); + PythonEngine.Shutdown(); } } @@ -360,7 +368,7 @@ public unsafe void ExportType() } // Python should have called release once `exportedPyType` went out-of-scope. - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); Assert.True(cSchema->format == null); Assert.Equal(0, cSchema->flags); Assert.Equal(0, cSchema->n_children); @@ -395,7 +403,7 @@ public unsafe void ExportField() // Python should have called release once `exportedPyField` went out-of-scope. Assert.True(cSchema->name == null); - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); Assert.True(cSchema->format == null); // Since we allocated, we are responsible for freeing the pointer. 
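The pattern this patch applies throughout is worth isolating: on .NET 5+ the C Data Interface callbacks are stored as unmanaged function pointers, while on .NET Framework they are stored as `IntPtr` and invoked through `Marshal.GetDelegateForFunctionPointer`, because C# function pointers (`delegate* unmanaged`) require .NET 5 or later. Below is a minimal sketch of that dual-target shape; the struct, field, and method names here are hypothetical stand-ins, not Arrow's actual types.

```csharp
using System;
using System.Runtime.InteropServices;

// Hypothetical release-callback holder mirroring CArrowArray/CArrowSchema.
public unsafe struct CHolder
{
#if NET5_0_OR_GREATER
    public delegate* unmanaged<CHolder*, void> release;
#else
    public IntPtr release;
#endif
}

public static unsafe class HolderInterop
{
#if !NET5_0_OR_GREATER
    // On .NET Framework the callback is invoked via a marshaled delegate.
    private delegate void ReleaseFn(CHolder* self);
#endif

    // Call release once, if set, then clear it: the shape used by Free().
    public static void CallRelease(CHolder* holder)
    {
        if (holder->release == default) return;
#if NET5_0_OR_GREATER
        holder->release(holder);
#else
        Marshal.GetDelegateForFunctionPointer<ReleaseFn>(holder->release)(holder);
#endif
        holder->release = default;
    }
}
```

Comparing and assigning `default` rather than `null` is what lets the same statements compile against both field types, which is why the patch rewrites `release != null` as `release != default` everywhere.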
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs index dfd6f9912cd4d..4aa5eb6b4d7ed 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs @@ -35,7 +35,7 @@ public unsafe void InitializeZeroed() Assert.Equal(0, cSchema->n_children); Assert.True(cSchema->children == null); Assert.True(cSchema->dictionary == null); - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); Assert.True(cSchema->private_data == null); CArrowSchema.Free(cSchema); @@ -86,12 +86,13 @@ public unsafe void CallsReleaseForValid() { CArrowSchema* cSchema = CArrowSchema.Create(); CArrowSchemaExporter.ExportType(Int32Type.Default, cSchema); - Assert.False(cSchema->release == null); + Assert.False(cSchema->release == default); CArrowSchemaImporter.ImportType(cSchema); - Assert.True(cSchema->release == null); + Assert.True(cSchema->release == default); CArrowSchema.Free(cSchema); } +#if NET5_0_OR_GREATER // can't round-trip marshaled delegate [Fact] public unsafe void CallsReleaseForInvalid() { @@ -103,7 +104,7 @@ public unsafe void CallsReleaseForInvalid() var releaseCallback = (CArrowSchema* cSchema) => { wasCalled = true; - cSchema->release = null; + cSchema->release = default; }; cSchema->release = (delegate* unmanaged)Marshal.GetFunctionPointerForDelegate( releaseCallback); @@ -117,5 +118,6 @@ public unsafe void CallsReleaseForInvalid() GC.KeepAlive(releaseCallback); } +#endif } } diff --git a/csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs b/csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs new file mode 100644 index 0000000000000..0b298dec414c0 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/Extensions/Net472Extensions.cs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System.Collections.Generic; + +namespace Apache.Arrow.Tests +{ + internal static class Net472Extensions + { + public static IEnumerable<(TFirst First, TSecond Second)> Zip(this IEnumerable first, IEnumerable second) + { + using (var enumerator1 = first.GetEnumerator()) + using (var enumerator2 = second.GetEnumerator()) + { + while (enumerator1.MoveNext() && enumerator2.MoveNext()) + { + yield return (enumerator1.Current, enumerator2.Current); + } + } + } + } +} From a2a3b9580d66cf40c5df8358f260fdaf5ccc301f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 22 Jul 2023 02:16:16 -0300 Subject: [PATCH 024/749] GH-36708: [C++] Fully calculate null-counts so the REE allocations make sense (#36740) ### Rationale for this change When `has_validity_buffer` is true, we expect validity buffers to be allocated, but if null_count is calculated and ends up being 0, `ArrayData::Make()` will sneakily remove the validity buffer from the physical array for us and the assumption that it exists stops holding and causes a crash. Forcing `null_count` calculation with `input.GetNullCount()` ensures `has_validity_buffer` won't be `true` if the `null_count` on the input ends up being 0. ### What changes are included in this PR? The fix and tests to reproduce it. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #36708 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Sutou Kouhei --- .../compute/kernels/ree_util_internal.cc | 18 ++-- .../arrow/compute/kernels/ree_util_internal.h | 27 ++++- .../compute/kernels/vector_run_end_encode.cc | 25 +++-- .../kernels/vector_run_end_encode_test.cc | 102 +++++++++++------- 4 files changed, 113 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/ree_util_internal.cc b/cpp/src/arrow/compute/kernels/ree_util_internal.cc index 00c885f6fa9db..d35c000678ba4 100644 --- a/cpp/src/arrow/compute/kernels/ree_util_internal.cc +++ b/cpp/src/arrow/compute/kernels/ree_util_internal.cc @@ -59,7 +59,7 @@ Result> PreallocateRunEndsArray( Result> PreallocateValuesArray( const std::shared_ptr& value_type, bool has_validity_buffer, int64_t length, - int64_t null_count, MemoryPool* pool, int64_t data_buffer_size) { + MemoryPool* pool, int64_t data_buffer_size) { std::vector> values_data_buffers; std::shared_ptr validity_buffer = NULLPTR; if (has_validity_buffer) { @@ -79,20 +79,22 @@ Result> PreallocateValuesArray( } else { values_data_buffers = {std::move(validity_buffer), std::move(values_buffer)}; } - return ArrayData::Make(value_type, length, std::move(values_data_buffers), null_count); + auto data = ArrayData::Make(value_type, length, std::move(values_data_buffers), + kUnknownNullCount); + DCHECK(!(has_validity_buffer && length > 0) || data->buffers[0]); + return data; } Result> PreallocateREEArray( std::shared_ptr ree_type, bool has_validity_buffer, - int64_t logical_length, int64_t physical_length, int64_t physical_null_count, - MemoryPool* pool, int64_t data_buffer_size) { + int64_t logical_length, int64_t physical_length, MemoryPool* pool, + int64_t data_buffer_size) { ARROW_ASSIGN_OR_RAISE( auto run_ends_data, PreallocateRunEndsArray(ree_type->run_end_type(), physical_length, pool)); - ARROW_ASSIGN_OR_RAISE( - auto values_data, - PreallocateValuesArray(ree_type->value_type(), has_validity_buffer, physical_length, - physical_null_count, pool, data_buffer_size)); + ARROW_ASSIGN_OR_RAISE(auto values_data, PreallocateValuesArray( + ree_type->value_type(), has_validity_buffer, + physical_length, pool, 
data_buffer_size)); return ArrayData::Make(std::move(ree_type), logical_length, {NULLPTR}, {std::move(run_ends_data), std::move(values_data)}, diff --git a/cpp/src/arrow/compute/kernels/ree_util_internal.h b/cpp/src/arrow/compute/kernels/ree_util_internal.h index 080d23c06a1f9..3293e754d3b65 100644 --- a/cpp/src/arrow/compute/kernels/ree_util_internal.h +++ b/cpp/src/arrow/compute/kernels/ree_util_internal.h @@ -333,18 +333,39 @@ Result> PreallocateRunEndsArray( const std::shared_ptr& run_end_type, int64_t physical_length, MemoryPool* pool); +/// \brief Preallocate the physical values array for a run-end encoded array +/// +/// data_buffer_size is passed here pre-calculated so this function doesn't have +/// to be template-specialized for each type. +/// +/// The null_count is left as kUnknownNullCount (or 0 if length is 0) and, if +/// after writing the values, the caller knows the null count, it can be set. +/// +/// \post if has_validity_buffer and length > 0, then data.buffer[0] != NULLPTR +/// +/// \param has_validity_buffer a validity buffer must be allocated +/// \param length the length of the values array +/// \param data_buffer_size the size of the data buffer for string and binary types Result> PreallocateValuesArray( const std::shared_ptr& value_type, bool has_validity_buffer, int64_t length, - int64_t null_count, MemoryPool* pool, int64_t data_buffer_size); + MemoryPool* pool, int64_t data_buffer_size); /// \brief Preallocate the ArrayData for the run-end encoded version /// of the flat input array /// +/// The top-level null_count is set to 0 (REEs keep all the data in child +/// arrays). The null_count of the values array (child_data[1]) is left as +/// kUnknownNullCount (or 0 if physical_length is 0) and, if after writing +/// the values, the caller knows the null count, it can be set. 
+/// +/// \post if has_validity_buffer and physical_length > 0, then +/// data.child_data[1].buffer[0] != NULLPTR +/// /// \param data_buffer_size the size of the data buffer for string and binary types Result> PreallocateREEArray( std::shared_ptr ree_type, bool has_validity_buffer, - int64_t logical_length, int64_t physical_length, int64_t physical_null_count, - MemoryPool* pool, int64_t data_buffer_size); + int64_t logical_length, int64_t physical_length, MemoryPool* pool, + int64_t data_buffer_size); /// \brief Writes a single run-end to the first slot of the pre-allocated /// run-end encoded array in out diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc index eef816a149c93..943fdcd6b147f 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc @@ -179,7 +179,9 @@ class RunEndEncodeImpl { ARROW_ASSIGN_OR_RAISE( auto output_array_data, ree_util::PreallocateREEArray(std::move(ree_type), has_validity_buffer, - input_length, 0, 0, ctx_->memory_pool(), 0)); + /*logical_length=*/input_length, + /*physical_length=*/0, ctx_->memory_pool(), + /*data_buffer_size=*/0)); output_->value = std::move(output_array_data); return Status::OK(); } @@ -196,17 +198,22 @@ class RunEndEncodeImpl { /*output_run_ends=*/NULLPTR); std::tie(num_valid_runs, num_output_runs, data_buffer_size) = counting_loop.CountNumberOfRuns(); + const auto physical_null_count = num_output_runs - num_valid_runs; + DCHECK(!has_validity_buffer || physical_null_count > 0) + << "has_validity_buffer is expected to imply physical_null_count > 0"; ARROW_ASSIGN_OR_RAISE( auto output_array_data, ree_util::PreallocateREEArray( - std::move(ree_type), has_validity_buffer, input_length, num_output_runs, - num_output_runs - num_valid_runs, ctx_->memory_pool(), data_buffer_size)); + std::move(ree_type), has_validity_buffer, /*logical_length=*/input_length, + /*physical_length=*/num_output_runs, ctx_->memory_pool(), data_buffer_size)); // Initialize the output pointers auto* output_run_ends = output_array_data->child_data[0]->template GetMutableValues(1, 0); auto* output_values_array_data = output_array_data->child_data[1].get(); + // Set the null_count on the physical array + output_values_array_data->null_count = physical_null_count; // Second pass: write the runs RunEndEncodingLoop writing_loop( @@ -254,7 +261,7 @@ struct RunEndEncodeExec { return RunEndEncodeNullArray(TypeTraits::type_singleton(), ctx, input_array, result); } else { - const bool has_validity_buffer = input_array.MayHaveNulls(); + const bool has_validity_buffer = input_array.GetNullCount() > 0; if (has_validity_buffer) { return RunEndEncodeImpl(ctx, input_array, result) .Exec(); @@ -398,10 +405,10 @@ class RunEndDecodeImpl { } } - ARROW_ASSIGN_OR_RAISE(auto output_array_data, - ree_util::PreallocateValuesArray( - ree_type->value_type(), has_validity_buffer, length, - kUnknownNullCount, ctx_->memory_pool(), data_buffer_size)); + ARROW_ASSIGN_OR_RAISE( + auto output_array_data, + ree_util::PreallocateValuesArray(ree_type->value_type(), has_validity_buffer, + length, ctx_->memory_pool(), data_buffer_size)); int64_t output_null_count = 0; if (length > 0) { @@ -435,7 +442,7 @@ struct RunEndDecodeExec { return RunEndDecodeNullREEArray(ctx, input_array, result); } else { const bool has_validity_buffer = - arrow::ree_util::ValuesArray(input_array).MayHaveNulls(); + arrow::ree_util::ValuesArray(input_array).GetNullCount() > 0; if 
(has_validity_buffer) { return RunEndDecodeImpl(ctx, input_array, result) .Exec(); diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc index f718d82774dcd..0bd8e3386e7cc 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc @@ -72,11 +72,19 @@ struct REETestData { std::vector inputs_json, std::vector expected_values_json, std::vector expected_run_ends_json, - int64_t input_offset = 0) { + int64_t input_offset = 0, + bool force_validity_bitmap = false) { std::vector> inputs; inputs.reserve(inputs_json.size()); for (const auto& input_json : inputs_json) { - inputs.push_back(ArrayFromJSON(data_type, input_json)); + auto chunk = ArrayFromJSON(data_type, input_json); + auto& data = chunk->data(); + if (force_validity_bitmap && !data->HasValidityBitmap()) { + EXPECT_OK_AND_ASSIGN(auto validity, AllocateBitmap(data->length)); + memset(validity->mutable_data(), 0xFF, validity->size()); + data->buffers[0] = std::move(validity); + } + inputs.push_back(std::move(chunk)); } auto chunked_input = std::make_shared(std::move(inputs)); @@ -165,47 +173,52 @@ class TestRunEndEncodeDecode : public ::testing::TestWithParam< DCHECK(datum.is_chunked_array()); return datum.chunked_array(); } -}; - -TEST_P(TestRunEndEncodeDecode, EncodeDecodeArray) { - auto [data, run_end_type] = GetParam(); - ASSERT_OK_AND_ASSIGN( - Datum encoded_datum, - RunEndEncode(data.InputDatum(), RunEndEncodeOptions{run_end_type})); - - auto encoded = AsChunkedArray(encoded_datum); - ASSERT_OK(encoded->ValidateFull()); - ASSERT_EQ(data.input->length(), encoded->length()); + void TestEncodeDecodeArray(REETestData& data, + const std::shared_ptr& run_end_type) { + ASSERT_OK_AND_ASSIGN( + Datum encoded_datum, + RunEndEncode(data.InputDatum(), RunEndEncodeOptions{run_end_type})); + + auto encoded = AsChunkedArray(encoded_datum); + ASSERT_OK(encoded->ValidateFull()); + ASSERT_EQ(data.input->length(), encoded->length()); + + for (int i = 0; i < encoded->num_chunks(); i++) { + auto& chunk = encoded->chunk(i); + auto run_ends_array = MakeArray(chunk->data()->child_data[0]); + auto values_array = MakeArray(chunk->data()->child_data[1]); + ASSERT_OK(chunk->ValidateFull()); + ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(run_end_type, data.expected_run_ends_json[i]), + *run_ends_array); + ASSERT_ARRAYS_EQUAL(*values_array, *data.expected_values[i]); + ASSERT_EQ(chunk->data()->buffers.size(), 1); + ASSERT_EQ(chunk->data()->buffers[0], NULLPTR); + ASSERT_EQ(chunk->data()->child_data.size(), 2); + ASSERT_EQ(run_ends_array->data()->buffers[0], NULLPTR); + ASSERT_EQ(run_ends_array->length(), data.expected_values[i]->length()); + ASSERT_EQ(run_ends_array->offset(), 0); + ASSERT_EQ(chunk->data()->length, data.input->chunk(i)->length()); + ASSERT_EQ(chunk->data()->offset, 0); + ASSERT_EQ(*chunk->data()->type, + RunEndEncodedType(run_end_type, data.input->type())); + ASSERT_EQ(chunk->data()->null_count, 0); + } - for (int i = 0; i < encoded->num_chunks(); i++) { - auto& chunk = encoded->chunk(i); - auto run_ends_array = MakeArray(chunk->data()->child_data[0]); - auto values_array = MakeArray(chunk->data()->child_data[1]); - ASSERT_OK(chunk->ValidateFull()); - ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(run_end_type, data.expected_run_ends_json[i]), - *run_ends_array); - ASSERT_ARRAYS_EQUAL(*values_array, *data.expected_values[i]); - ASSERT_EQ(chunk->data()->buffers.size(), 1); - ASSERT_EQ(chunk->data()->buffers[0], 
NULLPTR); - ASSERT_EQ(chunk->data()->child_data.size(), 2); - ASSERT_EQ(run_ends_array->data()->buffers[0], NULLPTR); - ASSERT_EQ(run_ends_array->length(), data.expected_values[i]->length()); - ASSERT_EQ(run_ends_array->offset(), 0); - ASSERT_EQ(chunk->data()->length, data.input->chunk(i)->length()); - ASSERT_EQ(chunk->data()->offset, 0); - ASSERT_EQ(*chunk->data()->type, RunEndEncodedType(run_end_type, data.input->type())); - ASSERT_EQ(chunk->data()->null_count, 0); + ASSERT_OK_AND_ASSIGN(Datum decoded_datum, data.chunked + ? RunEndDecode(encoded) + : RunEndDecode(encoded->chunk(0))); + auto decoded = AsChunkedArray(decoded_datum); + ASSERT_OK(decoded->ValidateFull()); + for (int i = 0; i < decoded->num_chunks(); i++) { + ASSERT_ARRAYS_EQUAL(*decoded->chunk(i), *data.input->chunk(i)); + } } +}; - ASSERT_OK_AND_ASSIGN(Datum decoded_datum, data.chunked - ? RunEndDecode(encoded) - : RunEndDecode(encoded->chunk(0))); - auto decoded = AsChunkedArray(decoded_datum); - ASSERT_OK(decoded->ValidateFull()); - for (int i = 0; i < decoded->num_chunks(); i++) { - ASSERT_ARRAYS_EQUAL(*decoded->chunk(i), *data.input->chunk(i)); - } +TEST_P(TestRunEndEncodeDecode, EncodeDecodeArray) { + auto [data, run_end_type] = GetParam(); + TestEncodeDecodeArray(data, run_end_type); } // Encoding an input with an offset results in a completely new encoded array without an @@ -254,6 +267,17 @@ TEST_P(TestRunEndEncodeDecode, DecodeWithOffset) { } } +// GH-36708 +TEST_P(TestRunEndEncodeDecode, InputWithValidityAndNoNulls) { + auto data = + REETestData::JSONChunked(int32(), + /*inputs=*/{"[1, 1, 2, 2, 2, 3]", "[4, 5, 5, 5, 6, 6]"}, + /*expected_values=*/{"[1, 2, 3]", "[4, 5, 6]"}, + /*expected_run_ends=*/{"[2, 5, 6]", "[1, 4, 6]"}, + /*input_offset=*/0, /*force_validity_bitmap=*/true); + TestEncodeDecodeArray(data, int32()); +} + // This test creates an run-end encoded array with an offset in the child array, which // removes the first run in the test data. It's no-op for chunked input. TEST_P(TestRunEndEncodeDecode, DecodeWithOffsetInChildArray) { From de1e40f5270e819191a319678653d0a4c0749181 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Mon, 24 Jul 2023 09:36:11 +0800 Subject: [PATCH 025/749] MINOR: [Doc] Fix columnar.rst buffer layout table inconsistencies (#36822) ### Rationale for this change 1. Some buffer layouts write out "padding" explicitly while some don't. I believe writing it out is help to distinguish paddings and null values. 2. Most cells start with a space while some don't. ### What changes are included in this PR? Fixed the above two inconsistencies. ### Are these changes tested? No need. ### Are there any user-facing changes? No. 
Authored-by: Jin Shang Signed-off-by: Sutou Kouhei --- docs/source/format/Columnar.rst | 132 ++++++++++++++++---------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index b90e2c97ade73..3390f1b7b5f2c 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -256,15 +256,15 @@ Would look like: :: * Length: 5, Null count: 1 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00011101 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00011101 | 0 (padding) | * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | unspecified | 2 | 4 | 8 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |-------------|-------------|-------------|-------------|-------------|-----------------------| + | 1 | unspecified | 2 | 4 | 8 | unspecified (padding) | **Example Layout: Non-null int32 Array** @@ -279,9 +279,9 @@ Would look like: :: * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |-------------|-------------|-------------|-------------|-------------|-----------------------| + | 1 | 2 | 3 | 4 | 8 | unspecified (padding) | or with the bitmap elided: :: @@ -289,9 +289,9 @@ or with the bitmap elided: :: * Validity bitmap buffer: Not required * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |-------------|-------------|-------------|-------------|-------------|-----------------------| + | 1 | 2 | 3 | 4 | 8 | unspecified (padding) | Variable-size Binary Layout --------------------------- @@ -342,13 +342,13 @@ will be represented as follows: :: | Bytes 0-19 | Bytes 20-63 | |----------------|-----------------------| - | 0, 3, 3, 3, 7 | unspecified | + | 0, 3, 3, 3, 7 | unspecified (padding) | * Value buffer: - | Bytes 0-6 | Bytes 7-63 | - |----------------|----------------------| - | joemark | unspecified | + | Bytes 0-6 | Bytes 7-63 | + |----------------|-----------------------| + | joemark | unspecified (padding) | .. 
_variable-size-list-layout: @@ -388,18 +388,18 @@ will have the following representation: :: * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 3 | 3 | 7 | 7 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-----------------------| + | 0 | 3 | 3 | 7 | 7 | unspecified (padding) | * Values array (Int8array): * Length: 7, Null count: 0 * Validity bitmap buffer: Not required * Values buffer (int8) - | Bytes 0-6 | Bytes 7-63 | - |------------------------------|-------------| - | 12, -7, 25, 0, -127, 127, 50 | unspecified | + | Bytes 0-6 | Bytes 7-63 | + |------------------------------|-----------------------| + | 12, -7, 25, 0, -127, 127, 50 | unspecified (padding) | **Example Layout: ``List>``** @@ -412,9 +412,9 @@ will be represented as follows: :: * Validity bitmap buffer: Not required * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |------------|------------|------------|-------------|-------------| - | 0 | 2 | 5 | 6 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|------------|------------|-------------|-----------------------| + | 0 | 2 | 5 | 6 | unspecified (padding) | * Values array (`List`) * Length: 6, Null count: 1 @@ -426,17 +426,17 @@ will be represented as follows: :: * Offsets buffer (int32) - | Bytes 0-27 | Bytes 28-63 | - |----------------------|-------------| - | 0, 2, 4, 7, 7, 8, 10 | unspecified | + | Bytes 0-27 | Bytes 28-63 | + |----------------------|-----------------------| + | 0, 2, 4, 7, 7, 8, 10 | unspecified (padding) | * Values array (Int8): * Length: 10, Null count: 0 * Validity bitmap buffer: Not required - | Bytes 0-9 | Bytes 10-63 | - |-------------------------------|-------------| - | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified | + | Bytes 0-9 | Bytes 10-63 | + |-------------------------------|-----------------------| + | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified (padding) | Fixed-Size List Layout ---------------------- @@ -511,9 +511,9 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: * Length: 4, Null count: 1 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00001011 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001011 | 0 (padding) | * Children arrays: * field-0 array (`VarBinary`): @@ -528,13 +528,13 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: | Bytes 0-19 | Bytes 20-63 | |----------------|-----------------------| - | 0, 3, 3, 3, 7 | unspecified | + | 0, 3, 3, 3, 7 | unspecified (padding) | * Value buffer: | Bytes 0-6 | Bytes 7-63 | |----------------|-----------------------| - | joemark | unspecified | + | joemark | unspecified (padding) | * field-1 array (int32 array): * Length: 4, Null count: 1 @@ -546,9 +546,9 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |------------|-------------|-------------|-------------|-------------| - | 1 | 2 | unspecified | 4 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + 
|-------------|-------------|-------------|-------------|-----------------------| + | 1 | 2 | unspecified | 4 | unspecified (padding) | Struct Validity ~~~~~~~~~~~~~~~ @@ -610,15 +610,15 @@ will have the following layout: :: * Length: 4, Null count: 0 * Types buffer: - |Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | - |---------|-------------|----------|----------|-------------| - | 0 | 0 | 0 | 1 | unspecified | + | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | + |----------|-------------|----------|----------|-----------------------| + | 0 | 0 | 0 | 1 | unspecified (padding) | * Offset buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |----------|-------------|------------|-------------|-------------| - | 0 | 1 | 2 | 0 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |-----------|-------------|------------|-------------|-----------------------| + | 0 | 1 | 2 | 0 | unspecified (padding) | * Children arrays: * Field-0 array (f: Float32): @@ -627,9 +627,9 @@ will have the following layout: :: * Value Buffer: - | Bytes 0-11 | Bytes 12-63 | - |----------------|-------------| - | 1.2, null, 3.4 | unspecified | + | Bytes 0-11 | Bytes 12-63 | + |----------------|-----------------------| + | 1.2, null, 3.4 | unspecified (padding) | * Field-1 array (i: Int32): @@ -638,9 +638,9 @@ will have the following layout: :: * Value Buffer: - | Bytes 0-3 | Bytes 4-63 | - |-----------|-------------| - | 5 | unspecified | + | Bytes 0-3 | Bytes 4-63 | + |-----------|-----------------------| + | 5 | unspecified (padding) | Sparse Union ~~~~~~~~~~~~ @@ -677,29 +677,29 @@ will have the following layout: :: * Length: 6, Null count: 4 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - |00010001 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00010001 | 0 (padding) | * Value buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | - |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | * f (Float32): * Length: 6, Null count: 4 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00001010 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001010 | 0 (padding) | * Value buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | - |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |--------------|-------------|-------------|-------------|-------------|-------------|-----------------------| + | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | * 
s (`VarBinary`) * Length: 6, Null count: 4 @@ -711,9 +711,9 @@ will have the following layout: :: * Offsets buffer (Int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | - |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | + |------------|-------------|-------------|-------------|-------------|-------------|-------------|------------------------| + | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified (padding) | * Values buffer: From 22339a28ea0c736d18e43ae7e238e2ea580be2d2 Mon Sep 17 00:00:00 2001 From: lambda <1wei@live.com> Date: Mon, 24 Jul 2023 09:38:33 +0800 Subject: [PATCH 026/749] MINOR: [Docs] Update document for AWS_ENDPOINT_URL environment variable (#36826) ### Rationale for this change update document for AWS_ENDPOINT_URL environment variable ### What changes are included in this PR? update document for AWS_ENDPOINT_URL environment variable ### Are these changes tested? ### Are there any user-facing changes? No Authored-by: yiwei.wang Signed-off-by: Sutou Kouhei --- docs/source/cpp/env_vars.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index e8490735926c1..8d10fd2cc2e40 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -138,6 +138,10 @@ that changing their value later will have an effect. SIGILL (Illegal Instruction). User must rebuild Arrow and PyArrow from scratch by setting cmake option ``ARROW_SIMD_LEVEL=NONE``. +.. envvar:: AWS_ENDPOINT_URL + + Endpoint URL used for S3-like storage, for example Minio or s3.scality. + .. envvar:: GANDIVA_CACHE_SIZE The number of entries to keep in the Gandiva JIT compilation cache. From 78bbe7676c1437a123f0ce3f4ba4089917f2be0a Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 24 Jul 2023 17:05:51 +0800 Subject: [PATCH 027/749] GH-36828: [C++][Parquet] Make buffered RowGroupSerializer using BufferedPageWriter (#36829) ### Rationale for this change See https://github.com/apache/arrow/issues/36828 ### What changes are included in this PR? Add `buffered` argument when building `PageWriter` ### Are these changes tested? no ### Are there any user-facing changes? 
no * Closes: #36828 Authored-by: mwish Signed-off-by: Gang Wu --- cpp/src/parquet/file_writer.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ef86e742362b8..2a6a88df2dd0a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -310,17 +310,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { std::unique_ptr pager; if (!codec_options) { - pager = PageWriter::Open(sink_, properties_->compression(path), col_meta, - row_group_ordinal_, static_cast(column_ordinal), - properties_->memory_pool(), false, meta_encryptor, - data_encryptor, properties_->page_checksum_enabled(), - ci_builder, oi_builder, CodecOptions()); + pager = PageWriter::Open( + sink_, properties_->compression(path), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, CodecOptions()); } else { - pager = PageWriter::Open(sink_, properties_->compression(path), col_meta, - row_group_ordinal_, static_cast(column_ordinal), - properties_->memory_pool(), false, meta_encryptor, - data_encryptor, properties_->page_checksum_enabled(), - ci_builder, oi_builder, *codec_options); + pager = PageWriter::Open( + sink_, properties_->compression(path), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); } column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); From fdf392d48e79253838c4ff2e1d2bd127cdf8fead Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 24 Jul 2023 12:09:22 +0100 Subject: [PATCH 028/749] MINOR: [R] Bump versions following 12.0.1.1 release (#36801) ### Rationale for this change Bumping version numbers after 12.0.1.1 release (this is a manual process for CRAN-only releases) Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/pkgdown/assets/versions.html | 5 +++-- r/pkgdown/assets/versions.json | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index 9c80c32735a85..31f393a27785d 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,8 @@ -
<p><a href="dev/">11.0.0.9000 (dev)</a></p>
-<p><a href="">11.0.0 (release)</a></p>
+<p><a href="dev/">12.0.1.9000 (dev)</a></p>
+<p><a href="">12.0.1.1 (release)</a></p>
+<p><a href="11.0.0.3/">11.0.0.3</a></p>
 <p><a href="10.0.1/">10.0.1</a></p>
 <p><a href="9.0.0/">9.0.0</a></p>
 <p><a href="8.0.0/">8.0.0</a></p>
diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index c7500309b22df..4d7658e6e5e25 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -4,7 +4,7 @@ "version": "dev/" }, { - "name": "12.0.1 (release)", + "name": "12.0.1.1 (release)", "version": "" }, { From a9f100c690ed8608142ec3d9af043b66a41543e0 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 24 Jul 2023 12:16:40 +0100 Subject: [PATCH 029/749] GH-36805: [R] Update NEWS.md for 13.0.0 (#36806) ### Rationale for this change Update NEWS.md for 13.0.0 * Closes: #36805 Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/NEWS.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index 45730a7b36018..6d09355170a67 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,36 @@ # arrow 12.0.1.9000 +## New features + +* `open_dataset()` now works with ND-JSON files (#35055) +* Calling `schema()` on multiple Arrow objects now returns the object's schema (#35543) +* dplyr `.by`/`by` argument now supported in arrow implementation of dplyr verbs (@eitsupi, #35667) + +## Minor improvements and fixes + +* Convenience function `arrow_array()` can be used to create Arrow Arrays (#36381) +* Convenience function `scalar()` can be used to create Arrow Scalars (#36265) +* Prevent crashes when passing data between arrow and duckdb by always calling `RecordBatchReader::ReadNext()` from DuckDB from the main R thread (#36307) +* Issue a warning for `set_io_thread_count()` with `num_threads` < 2 (#36304) +* Ensure missing grouping variables are added to the beginning of the variable list (#36305) +* CSV File reader options class objects can print the selected values (#35955) +* Schema metadata can be set as a named character vector (#35954) +* Ensure that the RStringViewer helper class does not own any Array references (#35812) +* `strptime()` in arrow will return a timezone-aware timestamp if `%z` is part of the format string (#35671) +* Column ordering when combining `group_by()` and `across()` now matches dplyr (@eitsupi, #35473) + +## Installation + +* Link to correct version of OpenSSL when using autobrew (#36551) +* Require cmake 3.16 in bundled build script (#36321) + +## Docs + +* Split out R6 classes and convenience functions to improve readability (#36394) +* Enable pkgdown built-in search (@eitsupi, #36374) +* Re-organise reference page on pkgdown site to improve readability (#36171) + # arrow 12.0.1.1 * Update a package version reference to be text only instead of numeric due to CRAN update requiring this (#36353, #36364) From c5444126b3470bab06658ceea9978e9da6e4b9c6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 24 Jul 2023 12:41:39 -0300 Subject: [PATCH 030/749] GH-36824: [C++] Improve the test tracing of CheckWithDifferentShapes in the if-else kernel tests (#36825) ### Rationale for this change The traces might show that a call is being made with a conditional scalar, but then a call is made with an array. If there is a bug in the kernel, it's hard to know which specialization is to blame given the trace. ### What changes are included in this PR? A simplification of `CheckWithDifferentShapes` and the addition of a bit more tracing information. ### Are these changes tested? Yes. The changes are only made to test code and I verified that bugs in the implementation get caught by the new code. ### Are there any user-facing changes? No.
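To make the expected-value logic concrete, here is a minimal standalone sketch of the broadcasting idea the patch relies on: every scalar operand is expanded to a full-length array and the all-array specialization of the kernel is used as the reference result. `BroadcastIfScalar` and `ReferenceIfElse` are illustrative names for this description only, not functions added by the patch (the patch's own helpers are `ArrayOrBroadcastScalar` and `ExpectedFromIfElse` in the diff below):

```cpp
#include "arrow/array/util.h"          // MakeArrayFromScalar
#include "arrow/compute/api_scalar.h"  // arrow::compute::IfElse
#include "arrow/datum.h"
#include "arrow/result.h"

// Broadcast a scalar Datum to an array of the given length; arrays pass through.
arrow::Result<arrow::Datum> BroadcastIfScalar(const arrow::Datum& input,
                                              int64_t length) {
  if (input.is_scalar()) {
    ARROW_ASSIGN_OR_RAISE(auto array,
                          arrow::MakeArrayFromScalar(*input.scalar(), length));
    return arrow::Datum(array);
  }
  return input;
}

// Compute the reference result for any scalar/array mix of operands by
// delegating to the all-array (AAA) specialization of the if_else kernel.
arrow::Result<arrow::Datum> ReferenceIfElse(const arrow::Datum& cond,
                                            const arrow::Datum& left,
                                            const arrow::Datum& right,
                                            int64_t length) {
  ARROW_ASSIGN_OR_RAISE(auto cond_arr, BroadcastIfScalar(cond, length));
  ARROW_ASSIGN_OR_RAISE(auto left_arr, BroadcastIfScalar(left, length));
  ARROW_ASSIGN_OR_RAISE(auto right_arr, BroadcastIfScalar(right, length));
  return arrow::compute::IfElse(cond_arr, left_arr, right_arr);
}
```

The all-scalar case is the one exception: there the expected value is picked directly from `left` or `right` based on the condition scalar, as `ExpectedFromIfElse` in the diff does.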
* Closes: #36824 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Antoine Pitrou --- .../compute/kernels/scalar_if_else_test.cc | 168 +++++++++++------- 1 file changed, 104 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 879d6285f3441..ded73f0371435 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -126,75 +126,115 @@ TYPED_TEST(TestIfElsePrimitive, IfElseFixedSizeRand) { CheckIfElseOutput(cond, left, right, expected_data); } -void CheckWithDifferentShapes(const std::shared_ptr& cond, - const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - // this will check for whole arrays, every scalar at i'th index and slicing (offset) - CheckScalar("if_else", {cond, left, right}, expected); - - auto len = left->length(); - std::vector array_indices = {-1}; // sentinel for make_input - std::vector scalar_indices(len); - std::iota(scalar_indices.begin(), scalar_indices.end(), 0); - auto make_input = [&](const std::shared_ptr& array, int64_t index, Datum* input, - Datum* input_broadcast, std::string* trace) { - if (index >= 0) { - // Use scalar from array[index] as input; broadcast scalar for computing expected - // result - ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(index)); - *trace += "@" + std::to_string(index) + "=" + scalar->ToString(); - *input = std::move(scalar); - ASSERT_OK_AND_ASSIGN(*input_broadcast, MakeArrayFromScalar(*input->scalar(), len)); +Datum ArrayOrBroadcastScalar(const Datum& input, int64_t length) { + if (input.is_scalar()) { + EXPECT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*input.scalar(), length)); + return array; + } + EXPECT_TRUE(input.is_array()); + return input; +} + +Result ExpectedFromIfElse( + const Datum& cond, const Datum& left, const Datum& right, + std::shared_ptr type, + const std::shared_ptr& expected_if_all_operands_are_arrays) { + if (cond.is_scalar() && left.is_scalar() && right.is_scalar()) { + const auto& scalar = cond.scalar_as(); + Datum expected; + if (scalar.is_valid) { + expected = scalar.value ? left : right; } else { - // Use array as input - *trace += "=Array"; - *input = *input_broadcast = array; + expected = MakeNullScalar(left.type()); } - }; - - enum { COND_SCALAR = 1, LEFT_SCALAR = 2, RIGHT_SCALAR = 4 }; - for (int mask = 1; mask <= (COND_SCALAR | LEFT_SCALAR | RIGHT_SCALAR); ++mask) { - for (int64_t cond_idx : (mask & COND_SCALAR) ? scalar_indices : array_indices) { - Datum cond_in, cond_bcast; - std::string trace_cond = "Cond"; - make_input(cond, cond_idx, &cond_in, &cond_bcast, &trace_cond); - - for (int64_t left_idx : (mask & LEFT_SCALAR) ? scalar_indices : array_indices) { - Datum left_in, left_bcast; - std::string trace_left = "Left"; - make_input(left, left_idx, &left_in, &left_bcast, &trace_left); - - for (int64_t right_idx : (mask & RIGHT_SCALAR) ? scalar_indices : array_indices) { - Datum right_in, right_bcast; - std::string trace_right = "Right"; - make_input(right, right_idx, &right_in, &right_bcast, &trace_right); - - SCOPED_TRACE(trace_right); - SCOPED_TRACE(trace_left); - SCOPED_TRACE(trace_cond); - - Datum expected; - ASSERT_OK_AND_ASSIGN(auto actual, IfElse(cond_in, left_in, right_in)); - if (mask == (COND_SCALAR | LEFT_SCALAR | RIGHT_SCALAR)) { - const auto& scalar = cond_in.scalar_as(); - if (scalar.is_valid) { - expected = scalar.value ? 
left_in : right_in; - } else { - expected = MakeNullScalar(left_in.type()); - } - if (!left_in.type()->Equals(*right_in.type())) { - ASSERT_OK_AND_ASSIGN(expected, - Cast(expected, CastOptions::Safe(actual.type()))); - } - } else { - ASSERT_OK_AND_ASSIGN(expected, IfElse(cond_bcast, left_bcast, right_bcast)); - } - AssertDatumsEqual(expected, actual, /*verbose=*/true); + if (!left.type()->Equals(*right.type())) { + return Cast(expected, CastOptions::Safe(std::move(type))); + } + return expected; + } + if (cond.is_array() && left.is_array() && right.is_array()) { + return expected_if_all_operands_are_arrays; + } + // When at least one of the inputs is an array, we expect the output + // to be the same as if all the scalars were broadcast to arrays. + const auto expected_length = + std::max(cond.length(), std::max(left.length(), right.length())); + SCOPED_TRACE("IfElseAAACall"); + return IfElse(ArrayOrBroadcastScalar(cond, expected_length), + ArrayOrBroadcastScalar(left, expected_length), + ArrayOrBroadcastScalar(right, expected_length)); +} + +bool NextScalarOrWholeArray(const std::shared_ptr& array, int* index, Datum* out) { + if (*index <= array->length()) { + if (*index < array->length()) { + EXPECT_OK_AND_ASSIGN(auto scalar, array->GetScalar(*index)); + *out = std::move(scalar); + } else { + *out = array; + } + *index += 1; + return true; + } + return false; +} + +std::string CodedCallName(const Datum& cond, const Datum& left, const Datum& right) { + std::string coded = "IfElse"; + coded += cond.is_scalar() ? "S" : "A"; + coded += left.is_scalar() ? "S" : "A"; + coded += right.is_scalar() ? "S" : "A"; + coded += "Call"; + return coded; +} + +void DoCheckWithDifferentShapes(const std::shared_ptr& cond, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + auto make_trace([&](const char* name, const Datum& datum, int index) { + std::string trace = name; + trace += " : "; + if (datum.is_scalar()) { + trace += "Scalar@" + std::to_string(index) + " = " + datum.scalar()->ToString(); + } else { + EXPECT_TRUE(datum.is_array()); + trace += "Array = [...]"; + } + return trace; + }); + Datum cond_in; + Datum left_in; + Datum right_in; + int cond_index = 0; + int left_index = 0; + int right_index = 0; + while (NextScalarOrWholeArray(cond, &cond_index, &cond_in)) { + SCOPED_TRACE(make_trace("Cond", cond_in, cond_index)); + while (NextScalarOrWholeArray(left, &left_index, &left_in)) { + SCOPED_TRACE(make_trace("Left", left_in, left_index)); + while (NextScalarOrWholeArray(right, &right_index, &right_in)) { + SCOPED_TRACE(make_trace("Right", right_in, right_index)); + Datum actual; + { + SCOPED_TRACE(CodedCallName(cond_in, left_in, right_in)); + ASSERT_OK_AND_ASSIGN(actual, IfElse(cond_in, left_in, right_in)); } + ASSERT_OK_AND_ASSIGN( + auto adjusted_expected, + ExpectedFromIfElse(cond_in, left_in, right_in, actual.type(), expected)); + AssertDatumsEqual(adjusted_expected, actual, /*verbose=*/true); } } - } // for (mask) + } +} + +void CheckWithDifferentShapes(const std::shared_ptr& cond, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + CheckScalar("if_else", {cond, left, right}, expected); + DoCheckWithDifferentShapes(cond, left, right, expected); } TYPED_TEST(TestIfElsePrimitive, IfElseFixedSize) { From b31977fd22afdb44cc8344f7814d1f6cd507a964 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 24 Jul 2023 23:55:55 +0800 Subject: [PATCH 031/749] GH-36773: [C++][Parquet] Avoid calculating prebuffer 
column bitmap multiple times (#36774) ### Rationale for this change According to https://github.com/apache/arrow/pull/36192 and https://github.com/apache/arrow/pull/36649 . RowGroupReader using a bitmap to control a column-level prebuffer. However, if all columns are selected, this will be a heavy overhead for building a bitmap multiple times. ### What changes are included in this PR? Build `Prebuffer` Bitmap once, and reuse that vector. ### Are these changes tested? no ### Are there any user-facing changes? no * Closes: #36773 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/file_reader.cc | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index adda9a027bded..08d493b0bca2f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -179,17 +179,17 @@ class SerializedRowGroup : public RowGroupReader::Contents { SerializedRowGroup(std::shared_ptr source, std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source, int64_t source_size, FileMetaData* file_metadata, - int row_group_number, const ReaderProperties& props, + int row_group_number, ReaderProperties props, std::shared_ptr prebuffered_column_chunks_bitmap, std::shared_ptr file_decryptor = nullptr) : source_(std::move(source)), cached_source_(std::move(cached_source)), source_size_(source_size), file_metadata_(file_metadata), - properties_(props), + properties_(std::move(props)), row_group_ordinal_(row_group_number), prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)), - file_decryptor_(file_decryptor) { + file_decryptor_(std::move(file_decryptor)) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -273,7 +273,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int row_group_ordinal_; - const std::shared_ptr prebuffered_column_chunks_bitmap_; + const std::shared_ptr prebuffered_column_chunks_bitmap_; std::shared_ptr file_decryptor_; }; @@ -366,13 +366,19 @@ class SerializedFile : public ParquetFileReader::Contents { std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options); std::vector<::arrow::io::ReadRange> ranges; prebuffered_column_chunks_.clear(); + int num_cols = file_metadata_->num_columns(); + // a bitmap for buffered columns. 
+ std::shared_ptr buffer_columns; + if (!row_groups.empty()) { + PARQUET_THROW_NOT_OK(AllocateEmptyBitmap(num_cols, properties_.memory_pool()) + .Value(&buffer_columns)); + for (int col : column_indices) { + ::arrow::bit_util::SetBit(buffer_columns->mutable_data(), col); + } + } for (int row : row_groups) { - std::shared_ptr& col_bitmap = prebuffered_column_chunks_[row]; - int num_cols = file_metadata_->num_columns(); - PARQUET_THROW_NOT_OK( - AllocateEmptyBitmap(num_cols, properties_.memory_pool()).Value(&col_bitmap)); + prebuffered_column_chunks_[row] = buffer_columns; for (int col : column_indices) { - ::arrow::bit_util::SetBit(col_bitmap->mutable_data(), col); ranges.push_back( ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); } From 8b7457e85926e0e7ba91e4ca92982aeff6a26ec4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 24 Jul 2023 19:26:57 +0100 Subject: [PATCH 032/749] MINOR: [R] Add NEWS gathering instructions (#36804) ### Rationale for this change Add instructions on how to find relevant commits to use as basis of updating NEWS.md during a release Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/PACKAGING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/PACKAGING.md b/r/PACKAGING.md index 6cfa903650729..ef996c88cf2cf 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -31,7 +31,7 @@ For a high-level overview of the release process see the - [ ] Ensure the contents of the README are accurate and up to date. - [ ] Run `urlchecker::url_check()` on the R directory at the release candidate. commit. Ignore any errors with badges as they will be removed in the CRAN release branch. -- [ ] [Polish NEWS](https://style.tidyverse.org/news.html#news-release) but do **not** update version numbers (this is done automatically later). +- [ ] [Polish NEWS](https://style.tidyverse.org/news.html#news-release) but do **not** update version numbers (this is done automatically later). You can find commits by, for example, `git log --oneline aa057d0..HEAD | grep "\[R\]"` - [ ] Run preliminary reverse dependency checks using `archery docker run r-revdepcheck`. - [ ] For major releases, prepare tweet thread highlighting new features. From 362387a3e2e63d5f9f15634a0ebc85b03def3759 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 24 Jul 2023 19:27:23 +0100 Subject: [PATCH 033/749] MINOR: [R] Update instructions for bumping NEWS.md (#36803) ### Rationale for this change Update for release instructions to make things more clear, based on my experience doing a release recently. Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/PACKAGING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/PACKAGING.md b/r/PACKAGING.md index ef996c88cf2cf..edfca651e9d38 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -126,7 +126,7 @@ Wait for CRAN... - [ ] Tag the tip of the CRAN-specific release branch - [ ] Add a new line to the matrix in the [backwards compatability job](https://github.com/apache/arrow/blob/main/dev/tasks/r/github.linux.arrow.version.back.compat.yml) - [ ] (patch releases only) Update the package version in `ci/scripts/PKGBUILD`, `dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb`, `r/DESCRIPTION`, and `r/NEWS.md` -- [ ] (CRAN-only releases) Rebuild the docs with `pkgdown::build_site(examples = FALSE, lazy = TRUE, install = FALSE)` and submit a PR to [the `asf-site` branch of the docs site](https://github.com/apache/arrow-site) with the contents of `r/docs/news/index.html`. 
+- [ ] (CRAN-only releases) Rebuild news page with `pkgdown::build_news()` and submit a PR to the asf-site branch of the docs site with the contents of `arrow/r/docs/news/index.html` replacing the current contents of `arrow-site/docs/r/news/index.html` - [ ] (CRAN-only releases) Bump the version number in `r/pkgdown/assets/versions.json`, and update this on the [the `asf-site` branch of the docs site](https://github.com/apache/arrow-site) too. - [ ] Update the packaging checklist template to reflect any new realities of the packaging process. From 6695d18eb2d7b8eada68e80f0fb030c12b70aabb Mon Sep 17 00:00:00 2001 From: Chelsea Jones <129552306+chelseajonesr@users.noreply.github.com> Date: Mon, 24 Jul 2023 11:29:24 -0700 Subject: [PATCH 034/749] GH-36793: [Go] Allow NewSchemaFromStruct to skip fields if tagged with parquet:"-" (#36794) ### Rationale for this change Allow skipping Go struct fields when serializing to Parquet by using a tag `parquet:"-"`, similarly to the standard Go JSON implementation. ### What changes are included in this PR? Add `Exclude` to the taggedInfo struct, which is used by `typeToNode` to skip the associated struct field. ### Are these changes tested? Yes, I modified an existing test to add a new excluded field. (I'm also using this change locally to read and write parquet files with existing Go structs.) ### Are there any user-facing changes? Yes, this modifies usage of the `parquet` tag. I couldn't find any relevant documentation that needs to be updated though; if there is any please let me know and I will do so. * Closes: #36793 Authored-by: Chelsea Jones Signed-off-by: Matt Topol --- go/parquet/schema/reflection.go | 13 +++++++++++-- go/parquet/schema/reflection_test.go | 3 ++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/go/parquet/schema/reflection.go b/go/parquet/schema/reflection.go index f1e204a171712..b85c1c28c781a 100644 --- a/go/parquet/schema/reflection.go +++ b/go/parquet/schema/reflection.go @@ -64,6 +64,8 @@ type taggedInfo struct { LogicalType LogicalType KeyLogicalType LogicalType ValueLogicalType LogicalType + + Exclude bool } func (t *taggedInfo) CopyForKey() (ret taggedInfo) { @@ -186,6 +188,7 @@ func newTaggedInfo() taggedInfo { LogicalType: NoLogicalType{}, KeyLogicalType: NoLogicalType{}, ValueLogicalType: NoLogicalType{}, + Exclude: false, } } @@ -232,6 +235,10 @@ func infoFromTags(f reflect.StructTag) *taggedInfo { if ptags, ok := f.Lookup("parquet"); ok { info := newTaggedInfo() + if ptags == "-" { + info.Exclude = true + return &info + } for _, tag := range strings.Split(strings.Replace(ptags, "\t", "", -1), ",") { tag = strings.TrimSpace(tag) kv := strings.SplitN(tag, "=", 2) @@ -370,8 +377,10 @@ func typeToNode(name string, typ reflect.Type, repType parquet.Repetition, info fields := make(FieldList, 0) for i := 0; i < typ.NumField(); i++ { f := typ.Field(i) - - fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, infoFromTags(f.Tag))) + tags := infoFromTags(f.Tag) + if tags == nil || !tags.Exclude { + fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, tags)) + } } // group nodes don't have a physical type if physical != parquet.Types.Undefined { diff --git a/go/parquet/schema/reflection_test.go b/go/parquet/schema/reflection_test.go index 7be1475513c52..4a029d058155a 100644 --- a/go/parquet/schema/reflection_test.go +++ b/go/parquet/schema/reflection_test.go @@ -309,7 +309,8 @@ func TestStructFromSchema(t *testing.T) { func TestStructFromSchemaWithNesting(t 
*testing.T) { type Other struct { - List *[]*float32 + List *[]*float32 + Excluded int32 `parquet:"-"` } type Nested struct { From 651c875c3f853bc7439a10cca2cf6e04e59c5fdd Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Mon, 24 Jul 2023 18:07:41 -0400 Subject: [PATCH 035/749] GH-35800: [Docs] Link to GeoArrow from canonical extension types docs (#36810) Adds a new section to the Canonical Extension Types docs listing "Community Extension Types" and explains that these have not been formally adopted by the Arrow developers. Lists GeoArrow as one of these and links to the GeoArrow repo. * Closes: #35800 --- docs/source/format/CanonicalExtensions.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5dd269ee5c675..9f7948cbfe980 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -147,3 +147,21 @@ Fixed shape tensor This structure has no relationship with the Fixed shape tensor extension type defined by this specification. Instead, this extension type lets one use fixed shape tensors as elements in a field of a RecordBatch or a Table. + +========================= +Community Extension Types +========================= + +In addition to the canonical extension types listed above, there exist Arrow +extension types that have been established as standards within specific domain +areas. These have not been officially designated as canonical through a +discussion and vote on the Arrow development mailing list but are well known +within subcommunities of Arrow developers. + +GeoArrow +======== + +`GeoArrow `_ defines a collection of +Arrow extension types for representing vector geometries. It is well known +within the Arrow geospatial subcommunity. The GeoArrow specification is not yet +finalized. From 06deded47a4571c40bee06fd28749bfd5633a823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 25 Jul 2023 02:42:23 +0200 Subject: [PATCH 036/749] GH-36839: [CI][Docs] Update test-ubuntu-default-docs to use GitHub actions instead of Azure (#36840) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Currently `test-ubuntu-default-docs` has been failing on Azure for the 13.0.0 RC0 and we had to use GitHub actions to generate the documentation. Using the same base action for both preview-docs, test and packaging will improve maintainability. ### What changes are included in this PR? Move `test-ubuntu-default-docs` to use GH actions instead of Azure. ### Are these changes tested? Yes, with archery related tasks. ### Are there any user-facing changes? 
No * Closes: #36839 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/tasks/docs/github.linux.yml | 12 ++++++++++-- dev/tasks/tasks.yml | 13 +++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dev/tasks/docs/github.linux.yml b/dev/tasks/docs/github.linux.yml index f9b2e111e8f3e..0938ac74adc46 100644 --- a/dev/tasks/docs/github.linux.yml +++ b/dev/tasks/docs/github.linux.yml @@ -21,7 +21,7 @@ jobs: test: - name: Docs Preview + name: Build Docs runs-on: ubuntu-latest {{ macros.github_set_env(env) }} steps: @@ -44,7 +44,8 @@ jobs: ref: {{ default_branch|default("main") }} path: crossbow fetch-depth: 1 - - name: Prepare docs + {% if publish %} + - name: Prepare Docs Preview run: | # build files are created by the docker user sudo chown -R ${USER}: build @@ -61,3 +62,10 @@ jobs: run: | aws s3 cp build/docs/ $BUCKET/pr_docs/{{ pr_number }}/ --recursive echo ":open_book: You can find the preview here: http://crossbow.voltrondata.com/pr_docs/{{ pr_number }}" >> $GITHUB_STEP_SUMMARY + {% endif %} + - name: Prepare Docs artifacts + run: | + cd build + sudo chown -R ${USER}: . + tar cvzf docs.tar.gz docs + {{ macros.github_upload_releases("build/docs.tar.gz")|indent }} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 879c2246b41ee..cca770438574a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1435,16 +1435,13 @@ tasks: {% endfor %} test-ubuntu-default-docs: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docs/github.linux.yml params: - artifacts: "build/docs.tar.gz" + pr_number: Unset flags: "-v $PWD/build/:/build/" image: ubuntu-docs - post_script: | - cd build - sudo chown -R ${USER}: . - tar cvzf docs.tar.gz docs + publish: false artifacts: - docs.tar.gz @@ -1565,6 +1562,6 @@ tasks: template: docs/github.linux.yml params: pr_number: Unset - artifacts: "build/docs.tar.gz" flags: "-v $PWD/build/:/build/" image: ubuntu-docs + publish: true From 9bf514a874baf8a19ef4a5a2fb2fea676e57b2ed Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 25 Jul 2023 17:18:43 +0900 Subject: [PATCH 037/749] GH-35292: [Release] Retry "apt install" (#36836) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Timeouts still happen in my local environment. ### What changes are included in this PR? Retry `apt install`. This is just a workaround. We should not close GH-35292 by this. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #35292 Authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- dev/release/verify-apt.sh | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh index 187482cbf52d2..49671f01cc7e8 100755 --- a/dev/release/verify-apt.sh +++ b/dev/release/verify-apt.sh @@ -45,7 +45,21 @@ echo "::group::Prepare repository" export DEBIAN_FRONTEND=noninteractive -APT_INSTALL="apt install -y -V --no-install-recommends" +retry() +{ + local n_retries=0 + local max_n_retries=3 + while !
"$@"; do + n_retries=$((n_retries + 1)) + if [ ${n_retries} -eq ${max_n_retries} ]; then + echo "Failed: $@" + return 1 + fi + echo "Retry: $@" + done +} + +APT_INSTALL="retry apt install -y -V --no-install-recommends" apt update ${APT_INSTALL} \ From 20bbe3e0966309a775bac0ef056b74b2b06fc75a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 25 Jul 2023 17:26:41 +0900 Subject: [PATCH 038/749] GH-36832: [Packaging][RPM] Remove needless Requires (#36833) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change `arrowXX-libs` doesn't use `gflags` but it depends on `gflags`. ### What changes are included in this PR? Remove needless explicit `Requires`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #36832 Authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- .../apache-arrow/yum/arrow.spec.in | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 67e0f8db8a4dd..46c6d91b2d5dc 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -238,27 +238,11 @@ cd - %package -n %{name}%{major_version}-libs Summary: Runtime libraries for Apache Arrow C++ License: Apache-2.0 -Requires: brotli -%if %{use_gflags} -Requires: gflags -%endif -%if %{use_glog} -Requires: glog -%endif -Requires: libzstd %if %{have_lz4_libs} Requires: lz4-libs %{lz4_requirement} %else Requires: lz4 %{lz4_requirement} %endif -%if %{have_re2} -Requires: re2 -%endif -Requires: snappy -%if %{have_utf8proc} -Requires: utf8proc -%endif -Requires: zlib %description -n %{name}%{major_version}-libs This package contains the libraries for Apache Arrow C++. @@ -414,8 +398,6 @@ Libraries and header files for Apache Arrow dataset. Summary: C++ library for fast data transport. License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: c-ares -Requires: openssl %description -n %{name}%{major_version}-flight-libs This package contains the libraries for Apache Arrow Flight. @@ -485,7 +467,6 @@ Libraries and header files for Apache Arrow Flight SQL. Summary: C++ library for compiling and evaluating expressions on Apache Arrow data. License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: ncurses-libs %description -n gandiva%{major_version}-libs This package contains the libraries for Gandiva. @@ -521,7 +502,6 @@ Libraries and header files for Gandiva. Summary: Runtime libraries for Apache Parquet C++ License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: openssl %description -n parquet%{major_version}-libs This package contains the libraries for Apache Parquet C++. @@ -570,7 +550,6 @@ Libraries and header files for Apache Parquet C++. Summary: Runtime libraries for Apache Arrow GLib License: Apache-2.0 Requires: %{name}%{major_version}-libs = %{version}-%{release} -Requires: glib2 %description -n %{name}%{major_version}-glib-libs This package contains the libraries for Apache Arrow GLib. From dbae5f07b62ccc4be6ab1f47d72eaf054d24ae05 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Tue, 25 Jul 2023 16:43:14 +0800 Subject: [PATCH 039/749] GH-36789: [C++] Support divide(duration, duration) (#36800) ### Rationale for this change Support divide(duration, duration), as pandas and numpy already do. 
### What changes are included in this PR? Add kernels divide(duration, duration)->float64 and divide_checked(duration, duration)->float64 ### Are these changes tested? Yes ### Are there any user-facing changes? No * Closes: #36789 Authored-by: Jin Shang Signed-off-by: Antoine Pitrou --- .../compute/kernels/scalar_arithmetic.cc | 16 +++++++++++++++ .../compute/kernels/scalar_temporal_test.cc | 20 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 2c7363b3ca486..c305028be19c9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" #include "arrow/util/int_util_overflow.h" @@ -1509,6 +1510,13 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { DCHECK_OK( divide->AddKernel({duration(unit), int64()}, duration(unit), std::move(exec))); } + + // Add divide(duration, duration) -> float64 + for (auto unit : TimeUnit::values()) { + auto exec = ScalarBinaryNotNull::Exec; + DCHECK_OK( + divide->AddKernel({duration(unit), duration(unit)}, float64(), std::move(exec))); + } DCHECK_OK(registry->AddFunction(std::move(divide))); // ---------------------------------------------------------------------- @@ -1523,6 +1531,14 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { std::move(exec))); } + // Add divide_checked(duration, duration) -> float64 + for (auto unit : TimeUnit::values()) { + auto exec = + ScalarBinaryNotNull::Exec; + DCHECK_OK(divide_checked->AddKernel({duration(unit), duration(unit)}, float64(), + std::move(exec))); + } + DCHECK_OK(registry->AddFunction(std::move(divide_checked))); // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index cd8abf6e923c8..4c7975add0308 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -26,6 +26,7 @@ #include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/formatting.h" @@ -1695,6 +1696,7 @@ TEST_F(ScalarTemporalTest, TestTemporalMultiplyDuration) { } TEST_F(ScalarTemporalTest, TestTemporalDivideDuration) { + // div(duration, integer) -> integer for (auto u : TimeUnit::values()) { for (auto numeric : NumericTypes()) { if (!is_integer(numeric->id())) continue; @@ -1718,6 +1720,24 @@ TEST_F(ScalarTemporalTest, TestTemporalDivideDuration) { CallFunction("divide_checked", {durations, zeros})); } } + + // div(duration, duration) -> float64 + auto left = ArrayFromJSON(duration(TimeUnit::SECOND), "[1, 2, 3, 4]"); + auto right = ArrayFromJSON(duration(TimeUnit::MILLI), "[4000, 300, 20, 1]"); + auto expected_left_by_right = + ArrayFromJSON(float64(), "[0.25, 6.666666666666667, 150, 4000]"); + auto expected_right_by_left = + ArrayFromJSON(float64(), "[4, 0.15, 0.006666666666666667, 0.00025]"); + CheckScalarBinary("divide", left, right, expected_left_by_right); + CheckScalarBinary("divide_checked", left, right, expected_left_by_right); + 
CheckScalarBinary("divide", right, left, expected_right_by_left); + CheckScalarBinary("divide_checked", right, left, expected_right_by_left); + + // Check dispatching + CheckDispatchBest("divide", {duration(TimeUnit::SECOND), duration(TimeUnit::MILLI)}, + {duration(TimeUnit::MILLI), duration(TimeUnit::MILLI)}); + CheckDispatchBest("divide", {duration(TimeUnit::NANO), duration(TimeUnit::MILLI)}, + {duration(TimeUnit::NANO), duration(TimeUnit::NANO)}); } TEST_F(ScalarTemporalTest, TestTemporalDifferenceWeeks) { From 16806a1486f644093b9578243cc55ac051c60056 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 25 Jul 2023 10:56:09 -0300 Subject: [PATCH 040/749] GH-36776: [C++] Make ListArray::FromArrays() handle sliced offsets Arrays containing nulls (#36780) ### Rationale for this change It fixes the issue described in #36776 ### What changes are included in this PR? The fix and a simplifications of the interaction between `ListArrayFromArrays` and `CleanOffsets` which is rather involved. ### Are these changes tested? Yes. I added a test that reproduces the issue before adding the fixes. * Closes: #36776 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Antoine Pitrou --- cpp/src/arrow/array/array_list_test.cc | 57 +++++++++++++++++ cpp/src/arrow/array/array_nested.cc | 87 +++++++++++++++----------- 2 files changed, 106 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 2a00cadcab9aa..a3a2f99851b55 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -215,17 +215,20 @@ class TestListArray : public ::testing::Test { // Offsets with nulls will match. ASSERT_OK_AND_ASSIGN(auto result, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, expected->null_bitmap())); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap @@ -233,6 +236,58 @@ class TestListArray : public ::testing::Test { expected->null_bitmap())); } + void TestFromArraysWithSlicedOffsets() { + std::vector offsets = {-1, -1, 0, 1, 2, 4}; + + std::shared_ptr offsets_wo_nulls; + ArrayFromVector(offsets, &offsets_wo_nulls); + + auto type = std::make_shared(int32()); + auto expected = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[0], [1], [0, null]]")); + auto values = expected->values(); + + // Apply an offset to the offsets array + auto sliced_offsets = offsets_wo_nulls->Slice(2, 4); + ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Non-zero starter offset + sliced_offsets = offsets_wo_nulls->Slice(3, 3); + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected->Slice(1, 2)); + } + + void TestFromArraysWithSlicedNullOffsets() { + std::vector offsets = {-1, -1, 0, 1, 1, 3}; + std::vector 
offsets_w_nulls_is_valid = {true, true, true, false, true, true}; + + std::shared_ptr offsets_w_nulls; + ArrayFromVector(offsets_w_nulls_is_valid, offsets, + &offsets_w_nulls); + + auto type = std::make_shared(int32()); + auto expected = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[0], null, [0, null]]")); + auto values = expected->values(); + + // Apply an offset to the offsets array with nulls (GH-36776) + auto sliced_offsets = offsets_w_nulls->Slice(2, 4); + ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Non-zero starter offset + sliced_offsets = offsets_w_nulls->Slice(3, 3); + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected->Slice(1, 2)); + } + void TestFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; @@ -586,6 +641,8 @@ TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { this->TestFromArraysWithNullBitMap(); + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 61eeb496e5a5b..df60074c78470 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -52,6 +52,9 @@ using internal::CopyBitmap; namespace { +/// \brief Clean offsets when their null_count is greater than 0 +/// +/// \pre offsets.null_count() > 0 template Result CleanListOffsets(const std::shared_ptr& validity_buffer, const Array& offsets, MemoryPool* pool) { @@ -59,43 +62,36 @@ Result CleanListOffsets(const std::shared_ptr& validity_bu using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; - const auto& typed_offsets = checked_cast(offsets); + DCHECK_GT(offsets.null_count(), 0); const int64_t num_offsets = offsets.length(); - DCHECK(validity_buffer == nullptr || offsets.null_count() == 0) - << "When a validity_buffer is passed, offsets must have no nulls"; + if (!offsets.IsValid(num_offsets - 1)) { + return Status::Invalid("Last list offset should be non-null"); + } - if (offsets.null_count() > 0) { - if (!offsets.IsValid(num_offsets - 1)) { - return Status::Invalid("Last list offset should be non-null"); - } + ARROW_ASSIGN_OR_RAISE(auto clean_offsets, + AllocateBuffer(num_offsets * sizeof(offset_type), pool)); - ARROW_ASSIGN_OR_RAISE(auto clean_offsets, - AllocateBuffer(num_offsets * sizeof(offset_type), pool)); + // Copy valid bits, ignoring the final offset (since for a length N list array, + // we have N + 1 offsets) + ARROW_ASSIGN_OR_RAISE( + auto clean_validity_buffer, + CopyBitmap(pool, offsets.null_bitmap()->data(), offsets.offset(), num_offsets - 1)); - // Copy valid bits, ignoring the final offset (since for a length N list array, - // we have N + 1 offsets) - ARROW_ASSIGN_OR_RAISE( - auto clean_validity_buffer, - offsets.null_bitmap()->CopySlice(0, bit_util::BytesForBits(num_offsets - 1))); - - const offset_type* raw_offsets = typed_offsets.raw_values(); - auto clean_raw_offsets = - reinterpret_cast(clean_offsets->mutable_data()); - - // Must work backwards so we can tell how many values were in the last non-null value - offset_type current_offset = raw_offsets[num_offsets - 
1]; - for (int64_t i = num_offsets - 1; i >= 0; --i) { - if (offsets.IsValid(i)) { - current_offset = raw_offsets[i]; - } - clean_raw_offsets[i] = current_offset; - } + const offset_type* raw_offsets = + checked_cast(offsets).raw_values(); + auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); - return BufferVector({std::move(clean_validity_buffer), std::move(clean_offsets)}); + // Must work backwards so we can tell how many values were in the last non-null value + offset_type current_offset = raw_offsets[num_offsets - 1]; + for (int64_t i = num_offsets - 1; i >= 0; --i) { + if (offsets.IsValid(i)) { + current_offset = raw_offsets[i]; + } + clean_raw_offsets[i] = current_offset; } - return BufferVector({validity_buffer, typed_offsets.values()}); + return BufferVector({std::move(clean_validity_buffer), std::move(clean_offsets)}); } template @@ -124,14 +120,21 @@ Result::ArrayType>> ListArrayFromArray return Status::NotImplemented("Null bitmap with offsets slice not supported."); } - std::shared_ptr offset_buf, validity_buf; - ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(null_bitmap, offsets, pool)); - int64_t null_count_ = null_bitmap ? null_count : offsets.null_count(); + // Clean the offsets if they contain nulls. + if (offsets.null_count() > 0) { + ARROW_ASSIGN_OR_RAISE(auto buffers, + CleanListOffsets(null_bitmap, offsets, pool)); + auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), + {values.data()}, offsets.null_count(), /*offset=*/0); + return std::make_shared(std::move(data)); + } - std::shared_ptr internal_data = ArrayData::Make( - type, offsets.length() - 1, std::move(buffers), null_count_, offsets.offset()); - internal_data->child_data.push_back(values.data()); - return std::make_shared(internal_data); + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(offsets); + auto buffers = BufferVector({std::move(null_bitmap), typed_offsets.values()}); + auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), + {values.data()}, null_count, offsets.offset()); + return std::make_shared(std::move(data)); } static std::shared_ptr SliceArrayWithOffsets(const Array& array, int64_t begin, @@ -374,10 +377,18 @@ Result> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } - ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(NULLPTR, *offsets, pool)); + if (offsets->null_count() > 0) { + ARROW_ASSIGN_OR_RAISE(auto buffers, + CleanListOffsets(NULLPTR, *offsets, pool)); + return std::make_shared(type, offsets->length() - 1, std::move(buffers), + keys, items, offsets->null_count(), 0); + } + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(*offsets); + auto buffers = BufferVector({nullptr, typed_offsets.values()}); return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, - items, offsets->null_count(), offsets->offset()); + items, /*null_count=*/0, offsets->offset()); } Result> MapArray::FromArrays(const std::shared_ptr& offsets, From 7b1e8348793924a1d95a66e200a4c33daaaa76cd Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Tue, 25 Jul 2023 10:25:43 -0400 Subject: [PATCH 041/749] GH-36852: [MATLAB] Add `arrow.type.Field` class (#36855) ### Rationale for this change Now that the MATLAB interface supports creating `Type` objects - like `arrow.type.Int64Type`, we can add support for `Field` objects (i.e. name + type). 
This mirrors the `Field` type in other Arrow interfaces, [like PyArrow](https://arrow.apache.org/docs/python/generated/pyarrow.field.html). ### What changes are included in this PR? Two new user-facing APIs have been added: 1. `arrow.field(name, type)` constructor function 2. `arrow.type.Field` class (returned by the `arrow.field` constructor function). **Example**: ```matlab >> field = arrow.field("Speed", arrow.type.uint64) field = Speed: uint64 >> field.Type ans = UInt64Type with properties: ID: UInt64 >> field.Name ans = "Speed" ``` ### Are these changes tested? Yes. 1. Added new `tField.m` MATLAB test class. ### Are there any user-facing changes? Yes. Two new user-facing APIs have been added: 1. `arrow.field(name, type)` constructor function 2. `arrow.type.Field` class (returned by the `arrow.field` constructor function) ### Future Directions 1. We intentionally placed `arrow.field` in the top-level `arrow` package, rather than under a nested `arrow.type` package (where the corresponding class `arrow.type.Field` is). This is to avoid naming conflicts between `arrow.type.field` and `arrow.type.Field` and also to make it easier to use the recommended public/user-facing APIs with less typing (i.e. without needing to type nested package names). While working on this change, we realized that it would make sense to move the "type constructor functions" (e.g. `arrow.type.boolean`, `arrow.type.uint64`, etc.) into the top-level `arrow` package, as well (i.e. `arrow.boolean`, `arrow.uint64`, etc.). In other words, moving forward, the recommended APIs for the MATLAB interface will be placed directly under the top-level `arrow` package. This should hopefully result in a simplified public interface, as well as make it easier to use multiple language bindings (e.g. MATLAB and PyArrow) together, with less context switching. ### Notes 1. @ sgilmore10 is working on a follow up PR (to address #36853) for simplifying the `switch` statement code in `makeTypeProxy`. Her solution will be more generic, so that we can re-use it elsewhere across the code base of the MATLAB interface. 2. Thank you @ sgilmore10 for your help with this pull request! 
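For reference, a minimal sketch of the C++ object that the new proxy ultimately
wraps (this is plain Arrow C++, not MATLAB code; it parallels the
`arrow.field("Speed", arrow.type.uint64)` example above):

```cpp
#include <arrow/api.h>
#include <iostream>

int main() {
  // arrow::field pairs a name with a DataType, which is what the
  // arrow.type.proxy.Field proxy constructs internally.
  std::shared_ptr<arrow::Field> field = arrow::field("Speed", arrow::uint64());
  std::cout << field->ToString() << std::endl;  // prints "Speed: uint64"
  return 0;
}
```
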
* Closes: #36852 Lead-authored-by: Kevin Gurney Co-authored-by: Kevin Gurney Co-authored-by: Sarah Gilmore Co-authored-by: Sutou Kouhei Signed-off-by: Kevin Gurney --- matlab/src/cpp/arrow/matlab/error/error.h | 1 + matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 + .../src/cpp/arrow/matlab/type/proxy/field.cc | 129 +++++++++++++++++ .../src/cpp/arrow/matlab/type/proxy/field.h | 44 ++++++ matlab/src/matlab/+arrow/+type/Field.m | 67 +++++++++ matlab/src/matlab/+arrow/field.m | 27 ++++ matlab/test/arrow/type/tField.m | 131 ++++++++++++++++++ .../cmake/BuildMatlabArrowInterface.cmake | 3 +- 8 files changed, 403 insertions(+), 1 deletion(-) create mode 100644 matlab/src/cpp/arrow/matlab/type/proxy/field.cc create mode 100644 matlab/src/cpp/arrow/matlab/type/proxy/field.h create mode 100644 matlab/src/matlab/+arrow/+type/Field.m create mode 100644 matlab/src/matlab/+arrow/field.m create mode 100644 matlab/test/arrow/type/tField.m diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index b1b7b75b8c84a..b253e6c20ed27 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -171,4 +171,5 @@ namespace arrow::matlab::error { static const char* STRING_BUILDER_APPEND_FAILED = "arrow:matlab:array:string:StringBuilderAppendFailed"; static const char* STRING_BUILDER_FINISH_FAILED = "arrow:matlab:array:string:StringBuilderFinishFailed"; static const char* UKNOWN_TIME_UNIT_ERROR_ID = "arrow:matlab:UnknownTimeUnit"; + static const char* FIELD_FAILED_TO_CREATE_TYPE_PROXY = "arrow:field:FailedToCreateTypeProxy"; } diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 2fb3207e590c6..ac9a595a45852 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -23,6 +23,7 @@ #include "arrow/matlab/type/proxy/primitive_ctype.h" #include "arrow/matlab/type/proxy/string_type.h" #include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/field.h" #include "factory.h" @@ -43,6 +44,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.array.proxy.StringArray , arrow::matlab::array::proxy::StringArray); REGISTER_PROXY(arrow.array.proxy.TimestampArray, arrow::matlab::array::proxy::NumericArray); REGISTER_PROXY(arrow.tabular.proxy.RecordBatch , arrow::matlab::tabular::proxy::RecordBatch); + REGISTER_PROXY(arrow.type.proxy.Field , arrow::matlab::type::proxy::Field); REGISTER_PROXY(arrow.type.proxy.Float32Type , arrow::matlab::type::proxy::PrimitiveCType); REGISTER_PROXY(arrow.type.proxy.Float64Type , arrow::matlab::type::proxy::PrimitiveCType); REGISTER_PROXY(arrow.type.proxy.UInt8Type , arrow::matlab::type::proxy::PrimitiveCType); diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/field.cc b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc new file mode 100644 index 0000000000000..4a43d813f0567 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/utf8.h" + +#include "arrow/matlab/type/proxy/field.h" +#include "arrow/matlab/error/error.h" + +#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/string_type.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::type::proxy { + + Field::Field(std::shared_ptr field) : field{std::move(field)} { + REGISTER_METHOD(Field, name); + REGISTER_METHOD(Field, type); + REGISTER_METHOD(Field, toString); + } + + std::shared_ptr Field::unwrap() { + return field; + } + + void Field::name(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const auto& str_utf8 = field->name(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + auto str_mda = factory.createScalar(str_utf16); + context.outputs[0] = str_mda; + } + + arrow::Result> makeTypeProxy(const std::shared_ptr& datatype) { + using arrow_type = arrow::Type::type; + namespace type_proxy = arrow::matlab::type::proxy; + switch (datatype->id()) { + case arrow_type::UINT8: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::UINT16: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::UINT32: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::UINT64: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::INT8: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::INT16: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::INT32: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::INT64: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::FLOAT: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::DOUBLE: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::BOOL: + return std::make_shared>(std::static_pointer_cast(datatype)); + case arrow_type::STRING: + return std::make_shared(std::static_pointer_cast(datatype)); + case arrow_type::TIMESTAMP: + return std::make_shared(std::static_pointer_cast(datatype)); + default: + return arrow::Status::NotImplemented("Unsupported DataType: " + datatype->ToString()); + } + } + + + void Field::type(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + + auto datatype = field->type(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto proxy, makeTypeProxy(datatype), context, "arrow:field:FailedToCreateTypeProxy"); + const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(proxy); + + mda::ArrayFactory factory; + context.outputs[0] = factory.createScalar(proxy_id); + context.outputs[1] = factory.createScalar(static_cast(datatype->id())); + } + + void Field::toString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const 
auto str_utf8 = field->ToString(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + auto str_mda = factory.createScalar(str_utf16); + context.outputs[0] = str_mda; + } + + libmexclass::proxy::MakeResult Field::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using FieldProxy = arrow::matlab::type::proxy::Field; + + mda::StructArray opts = constructor_arguments[0]; + const mda::StringArray name_mda = opts[0]["Name"]; + const mda::TypedArray type_proxy_id_mda = opts[0]["TypeProxyID"]; + + const std::u16string& name_utf16 = name_mda[0]; + MATLAB_ASSIGN_OR_ERROR(const auto name, + arrow::util::UTF16StringToUTF8(name_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + auto proxy = std::static_pointer_cast(libmexclass::proxy::ProxyManager::getProxy(type_proxy_id_mda[0])); + auto type = proxy->unwrap(); + auto field = arrow::field(name, type); + return std::make_shared(std::move(field)); + } + +} + diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/field.h b/matlab/src/cpp/arrow/matlab/type/proxy/field.h new file mode 100644 index 0000000000000..8df73aa8af3a2 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/type.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::type::proxy { + +class Field : public libmexclass::proxy::Proxy { + public: + Field(std::shared_ptr field); + + virtual ~Field() {} + + std::shared_ptr unwrap(); + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void name(libmexclass::proxy::method::Context& context); + void type(libmexclass::proxy::method::Context& context); + void toString(libmexclass::proxy::method::Context& context); + + std::shared_ptr field; +}; + +} diff --git a/matlab/src/matlab/+arrow/+type/Field.m b/matlab/src/matlab/+arrow/+type/Field.m new file mode 100644 index 0000000000000..aaab36b048e37 --- /dev/null +++ b/matlab/src/matlab/+arrow/+type/Field.m @@ -0,0 +1,67 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Field < matlab.mixin.CustomDisplay +%FIELD A class representing a name and a type. +% Fields are often used in tabular schemas for describing a column's +% name and type. + + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + properties (Dependent) + % Name of the field + Name + % Arrow type of the field + Type + end + + methods + function obj = Field(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.type.proxy.Field")} + end + import arrow.internal.proxy.validate + + obj.Proxy = proxy; + end + + function type = get.Type(obj) + [proxyID, typeID] = obj.Proxy.type(); + traits = arrow.type.traits.traits(arrow.type.ID(typeID)); + proxy = libmexclass.proxy.Proxy(Name=traits.TypeProxyClassName, ID=proxyID); + type = traits.TypeConstructor(proxy); + end + + function name = get.Name(obj) + name = obj.Proxy.name(); + end + + end + + methods (Access = private) + function str = toString(obj) + str = obj.Proxy.toString(); + end + end + + methods (Access=protected) + function displayScalarObject(obj) + disp(obj.toString()); + end + end + +end diff --git a/matlab/src/matlab/+arrow/field.m b/matlab/src/matlab/+arrow/field.m new file mode 100644 index 0000000000000..a14ed2268bd35 --- /dev/null +++ b/matlab/src/matlab/+arrow/field.m @@ -0,0 +1,27 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function f = field(name, type) +%FIELD Creates an arrow.type.Field object + arguments + name(1, 1) string {mustBeNonmissing} + type(1, 1) arrow.type.Type + end + + typeProxyID = type.Proxy.ID; + args = struct(Name=name, TypeProxyID=typeProxyID); + proxy = arrow.internal.proxy.create("arrow.type.proxy.Field", args); + f = arrow.type.Field(proxy); +end \ No newline at end of file diff --git a/matlab/test/arrow/type/tField.m b/matlab/test/arrow/type/tField.m new file mode 100644 index 0000000000000..9f0a8851591ee --- /dev/null +++ b/matlab/test/arrow/type/tField.m @@ -0,0 +1,131 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tField < matlab.unittest.TestCase +% Test class for arrow.type.Field and arrow.field. + + methods(Test) + function TestBasic(testCase) + name = "A"; + type = arrow.type.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, name); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestSupportedTypes(testCase) + name = "name"; + supportedTypes = { ... + arrow.type.uint8, ... + arrow.type.uint16, ... + arrow.type.uint32, ... + arrow.type.uint64, ... + arrow.type.int8, ... + arrow.type.int16, ... + arrow.type.int32, ... + arrow.type.int64, ... + arrow.type.boolean, ... + arrow.type.float32, ... + arrow.type.float64, ... + arrow.type.string, ... + arrow.type.timestamp, ... + }; + for ii = 1:numel(supportedTypes) + supportedType = supportedTypes{ii}; + field = arrow.field(name, supportedType); + testCase.verifyEqual(field.Name, name); + testCase.verifyEqual(field.Type.ID, supportedType.ID); + end + end + + function TestNameUnicode(testCase) + smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + + type = arrow.type.uint64; + field = arrow.field(smiley, type); + + testCase.verifyEqual(field.Name, smiley); + testCase.verifyEqual(field.Type.ID, type.ID); + + field = arrow.field(tree, type); + + testCase.verifyEqual(field.Name, tree); + testCase.verifyEqual(field.Type.ID, type.ID); + + field = arrow.field(mango, type); + + testCase.verifyEqual(field.Name, mango); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestErrorIfNameStringMissing(testCase) + name = string(missing); + type = arrow.type.uint64; + testCase.verifyError(@() arrow.field(name, type), "MATLAB:validators:mustBeNonmissing"); + end + + function TestNameEmptyString(testCase) + name = ""; + type = arrow.type.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, name); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestNameCharVector(testCase) + name = 'ABC'; + type = arrow.type.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, string(name)); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestNameNumber(testCase) + name = 123; + type = arrow.type.uint64; + field = arrow.field(name, type); + + testCase.verifyEqual(field.Name, string(123)); + testCase.verifyEqual(field.Type.ID, type.ID); + end + + function TestArrowTypeUnsupportedInput(testCase) + name = "A"; + type = { 123 }; + testCase.verifyError(@() arrow.field(name, type), "MATLAB:validation:UnableToConvert"); + end + + function TestNameUnsupportedInput(testCase) + name = table(); + type = arrow.type.uint64; + testCase.verifyError(@() arrow.field(name, type), "MATLAB:validation:UnableToConvert"); + end + + function TestImmutableProperties(testCase) + name = "A"; + type = arrow.type.uint64; + field = arrow.field(name, type); + + testCase.verifyError(@() setfield(field, "Name", "NewValue"), "MATLAB:class:noSetMethod") + testCase.verifyError(@() setfield(field, "Type", arrow.type.boolean), "MATLAB:class:noSetMethod") + end + + end +end diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake 
b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index c10ce07280fa6..c5a7c08aa5c10 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -51,7 +51,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/type.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/fixed_width_type.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc") + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy/factory.cc") From 969f4b439c8dd9052372fe6c652dbd2459678042 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 25 Jul 2023 10:33:56 -0400 Subject: [PATCH 042/749] GH-36488: [C++] Import/Export ArrowDeviceArray (#36489) ### Rationale for this change With the addition of the `ArrowDeviceArray` we should provide import/export functions just like we have for `ArrowArray`. ### What changes are included in this PR? Adding Import/Export functions to the `bridge.h/.cc` files for C Data Non-CPU data. This requires adding a device type to buffers and memory managers to propagate through. ### Are these changes tested? Yes. (Still need to add tests to `bridge_test.cc` but I wanted to get eyes on this first) ### Are there any user-facing changes? No. * Closes: #36488 Lead-authored-by: Matt Topol Co-authored-by: Weston Pace Signed-off-by: Matt Topol --- cpp/src/arrow/buffer.h | 22 +- cpp/src/arrow/buffer_test.cc | 14 ++ cpp/src/arrow/c/bridge.cc | 192 +++++++++++++- cpp/src/arrow/c/bridge.h | 130 ++++++++++ cpp/src/arrow/c/bridge_test.cc | 404 ++++++++++++++++++++++++++++++ cpp/src/arrow/device.h | 28 +++ cpp/src/arrow/gpu/cuda_context.cc | 3 +- cpp/src/arrow/gpu/cuda_context.h | 4 + cpp/src/arrow/gpu/cuda_memory.cc | 21 ++ cpp/src/arrow/gpu/cuda_memory.h | 8 +- cpp/src/arrow/gpu/cuda_test.cc | 4 + 11 files changed, 823 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index ad2496aeeb5a6..08a3bd749e25d 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -58,18 +59,31 @@ class ARROW_EXPORT Buffer { /// /// \note The passed memory must be kept alive through some other means Buffer(const uint8_t* data, int64_t size) - : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) { + : is_mutable_(false), + is_cpu_(true), + data_(data), + size_(size), + capacity_(size), + device_type_(DeviceAllocationType::kCPU) { SetMemoryManager(default_cpu_memory_manager()); } Buffer(const uint8_t* data, int64_t size, std::shared_ptr mm, - std::shared_ptr parent = NULLPTR) + std::shared_ptr parent = NULLPTR, + std::optional device_type = std::nullopt) : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(std::move(parent)) { + // SetMemoryManager will also set device_type_ SetMemoryManager(std::move(mm)); + // if a device type is specified, use that instead. for example: + // CUDA_HOST. The CudaMemoryManager will set device_type_ to CUDA, + // but you can specify CUDA_HOST as the device type to override it. 
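+    // For example (illustrative only), constructing
+    //   Buffer(data, size, cuda_mm, /*parent=*/NULLPTR,
+    //          DeviceAllocationType::kCUDA_HOST)
+    // keeps the CUDA memory manager while recording that the bytes live in
+    // pinned host memory.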
+ if (device_type != std::nullopt) { + device_type_ = device_type; + } } Buffer(uintptr_t address, int64_t size, std::shared_ptr mm, @@ -282,6 +296,8 @@ class ARROW_EXPORT Buffer { const std::shared_ptr& memory_manager() const { return memory_manager_; } + std::optional device_type() const { return device_type_; } + std::shared_ptr parent() const { return parent_; } /// \brief Get a RandomAccessFile for reading a buffer @@ -336,6 +352,7 @@ class ARROW_EXPORT Buffer { const uint8_t* data_; int64_t size_; int64_t capacity_; + std::optional device_type_; // null by default, but may be set std::shared_ptr parent_; @@ -353,6 +370,7 @@ class ARROW_EXPORT Buffer { void SetMemoryManager(std::shared_ptr mm) { memory_manager_ = std::move(mm); is_cpu_ = memory_manager_->is_cpu(); + device_type_ = memory_manager_->device()->device_type(); } }; diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc index ce8bab846d586..3dd95cb8af5c6 100644 --- a/cpp/src/arrow/buffer_test.cc +++ b/cpp/src/arrow/buffer_test.cc @@ -41,6 +41,7 @@ using internal::checked_cast; using internal::checked_pointer_cast; static const char kMyDeviceTypeName[] = "arrowtest::MyDevice"; +static const DeviceAllocationType kMyDeviceType = DeviceAllocationType::kEXT_DEV; static const int kMyDeviceAllowCopy = 1; static const int kMyDeviceAllowView = 2; @@ -70,6 +71,8 @@ class MyDevice : public Device { return checked_cast(other).value_ == value_; } + DeviceAllocationType device_type() const override { return kMyDeviceType; } + std::shared_ptr default_memory_manager() override; int value() const { return value_; } @@ -256,6 +259,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -263,6 +267,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -271,6 +276,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -280,6 +286,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -290,6 +297,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -297,6 +305,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -305,6 +314,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); 
ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -315,6 +325,7 @@ TEST_F(TestDevice, Copy) { ASSERT_EQ(buffer->device(), my_copy_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_NE(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -330,6 +341,7 @@ TEST_F(TestDevice, View) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_EQ(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); @@ -338,6 +350,7 @@ TEST_F(TestDevice, View) { ASSERT_EQ(buffer->device(), my_view_device_); ASSERT_FALSE(buffer->is_cpu()); ASSERT_EQ(buffer->address(), cpu_src_->address()); + ASSERT_EQ(buffer->device_type(), kMyDeviceType); #ifdef NDEBUG ASSERT_EQ(buffer->data(), nullptr); #endif @@ -348,6 +361,7 @@ TEST_F(TestDevice, View) { ASSERT_EQ(buffer->device(), cpu_device_); ASSERT_TRUE(buffer->is_cpu()); ASSERT_EQ(buffer->address(), my_copy_src_->address()); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCPU); ASSERT_NE(buffer->data(), nullptr); AssertBufferEqual(*buffer, "some data"); diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 85a5156d11db2..13355dd6d05ae 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -522,6 +522,8 @@ struct ExportedArrayPrivateData : PoolAllocationMixin std::shared_ptr data_; + RawSyncEvent sync_event_; + ExportedArrayPrivateData() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData); ARROW_DISALLOW_COPY_AND_ASSIGN(ExportedArrayPrivateData); @@ -544,7 +546,12 @@ void ReleaseExportedArray(struct ArrowArray* array) { << "Dictionary release callback should have marked it released"; } DCHECK_NE(array->private_data, nullptr); - delete reinterpret_cast(array->private_data); + auto* pdata = reinterpret_cast(array->private_data); + if (pdata->sync_event_.sync_event != nullptr && + pdata->sync_event_.release_func != nullptr) { + pdata->sync_event_.release_func(pdata->sync_event_.sync_event); + } + delete pdata; ArrowArrayMarkReleased(array); } @@ -584,6 +591,7 @@ struct ArrayExporter { // Store owning pointer to ArrayData export_.data_ = data; + export_.sync_event_ = RawSyncEvent(); return Status::OK(); } @@ -663,6 +671,118 @@ Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, return Status::OK(); } +////////////////////////////////////////////////////////////////////////// +// C device arrays + +Status ValidateDeviceInfo(const ArrayData& data, + std::optional* device_type, + int64_t* device_id) { + for (const auto& buf : data.buffers) { + if (!buf) { + continue; + } + + if (*device_type == std::nullopt) { + *device_type = buf->device_type(); + *device_id = buf->device()->device_id(); + continue; + } + + if (buf->device_type() != *device_type) { + return Status::Invalid( + "Exporting device array with buffers on more than one device."); + } + + if (buf->device()->device_id() != *device_id) { + return Status::Invalid( + "Exporting device array with buffers on multiple device ids."); + } + } + + for (const auto& child : data.child_data) { + RETURN_NOT_OK(ValidateDeviceInfo(*child, device_type, device_id)); + } + + return Status::OK(); +} + +Result, int64_t>> ValidateDeviceInfo( + const ArrayData& data) 
{
+  std::optional<DeviceAllocationType> device_type;
+  int64_t device_id = -1;
+  RETURN_NOT_OK(ValidateDeviceInfo(data, &device_type, &device_id));
+  return std::make_pair(device_type, device_id);
+}
+
+Status ExportDeviceArray(const Array& array, RawSyncEvent sync_event,
+                         struct ArrowDeviceArray* out, struct ArrowSchema* out_schema) {
+  if (sync_event.sync_event != nullptr && !sync_event.release_func) {
+    return Status::Invalid(
+        "Must provide a release event function if providing a non-null event");
+  }
+
+  SchemaExportGuard guard(out_schema);
+  if (out_schema != nullptr) {
+    RETURN_NOT_OK(ExportType(*array.type(), out_schema));
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto device_info, ValidateDeviceInfo(*array.data()));
+  if (!device_info.first) {
+    out->device_type = ARROW_DEVICE_CPU;
+  } else {
+    out->device_type = static_cast<ArrowDeviceType>(*device_info.first);
+  }
+  out->device_id = device_info.second;
+
+  ArrayExporter exporter;
+  RETURN_NOT_OK(exporter.Export(array.data()));
+  exporter.Finish(&out->array);
+
+  auto* pdata = reinterpret_cast<ExportedArrayPrivateData*>(out->array.private_data);
+  pdata->sync_event_ = sync_event;
+  out->sync_event = sync_event.sync_event;
+
+  guard.Detach();
+  return Status::OK();
+}
+
+Status ExportDeviceRecordBatch(const RecordBatch& batch, RawSyncEvent sync_event,
+                               struct ArrowDeviceArray* out,
+                               struct ArrowSchema* out_schema) {
+  if (sync_event.sync_event != nullptr && sync_event.release_func == nullptr) {
+    return Status::Invalid(
+        "Must provide a release event function if providing a non-null event");
+  }
+
+  // XXX perhaps bypass ToStructArray for speed?
+  ARROW_ASSIGN_OR_RAISE(auto array, batch.ToStructArray());
+
+  SchemaExportGuard guard(out_schema);
+  if (out_schema != nullptr) {
+    // Export the schema, not the struct type, so as not to lose top-level metadata
+    RETURN_NOT_OK(ExportSchema(*batch.schema(), out_schema));
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto device_info, ValidateDeviceInfo(*array->data()));
+  if (!device_info.first) {
+    out->device_type = ARROW_DEVICE_CPU;
+  } else {
+    out->device_type = static_cast<ArrowDeviceType>(*device_info.first);
+  }
+  out->device_id = device_info.second;
+
+  ArrayExporter exporter;
+  RETURN_NOT_OK(exporter.Export(array->data()));
+  exporter.Finish(&out->array);
+
+  auto* pdata = reinterpret_cast<ExportedArrayPrivateData*>(out->array.private_data);
+  pdata->sync_event_ = sync_event;
+  out->sync_event = sync_event.sync_event;
+
+  guard.Detach();
+  return Status::OK();
+}
+
 //////////////////////////////////////////////////////////////////////////
 // C schema import
 
@@ -1242,6 +1362,7 @@ namespace {
 
 // The ArrowArray is released on destruction.
struct ImportedArrayData { struct ArrowArray array_; + void* sync_event_; ImportedArrayData() { ArrowArrayMarkReleased(&array_); // Initially released @@ -1267,6 +1388,11 @@ class ImportedBuffer : public Buffer { std::shared_ptr import) : Buffer(data, size), import_(std::move(import)) {} + ImportedBuffer(const uint8_t* data, int64_t size, std::shared_ptr mm, + DeviceAllocationType device_type, + std::shared_ptr import) + : Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {} + ~ImportedBuffer() override {} protected: @@ -1275,7 +1401,20 @@ class ImportedBuffer : public Buffer { struct ArrayImporter { explicit ArrayImporter(const std::shared_ptr& type) - : type_(type), zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)) {} + : type_(type), + zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)), + device_type_(DeviceAllocationType::kCPU) {} + + Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) { + ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id)); + device_type_ = static_cast(src->device_type); + RETURN_NOT_OK(Import(&src->array)); + import_->sync_event_ = src->sync_event; + // reset internal state before next import + memory_mgr_.reset(); + device_type_ = DeviceAllocationType::kCPU; + return Status::OK(); + } Status Import(struct ArrowArray* src) { if (ArrowArrayIsReleased(src)) { @@ -1588,7 +1727,12 @@ struct ArrayImporter { std::shared_ptr* out = &data_->buffers[buffer_id]; auto data = reinterpret_cast(c_struct_->buffers[buffer_id]); if (data != nullptr) { - *out = std::make_shared(data, buffer_size, import_); + if (memory_mgr_) { + *out = std::make_shared(data, buffer_size, memory_mgr_, + device_type_, import_); + } else { + *out = std::make_shared(data, buffer_size, import_); + } } else if (is_null_bitmap) { out->reset(); } else { @@ -1613,6 +1757,9 @@ struct ArrayImporter { // For imported null buffer pointers std::shared_ptr zero_size_buffer_; + + std::shared_ptr memory_mgr_; + DeviceAllocationType device_type_; }; } // namespace @@ -1652,6 +1799,45 @@ Result> ImportRecordBatch(struct ArrowArray* array, return ImportRecordBatch(array, *maybe_schema); } +Result> ImportDeviceArray(struct ArrowDeviceArray* array, + std::shared_ptr type, + const DeviceMemoryMapper& mapper) { + ArrayImporter importer(type); + RETURN_NOT_OK(importer.Import(array, mapper)); + return importer.MakeArray(); +} + +Result> ImportDeviceArray(struct ArrowDeviceArray* array, + struct ArrowSchema* type, + const DeviceMemoryMapper& mapper) { + auto maybe_type = ImportType(type); + if (!maybe_type.ok()) { + ArrowArrayRelease(&array->array); + return maybe_type.status(); + } + return ImportDeviceArray(array, *maybe_type, mapper); +} + +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, std::shared_ptr schema, + const DeviceMemoryMapper& mapper) { + auto type = struct_(schema->fields()); + ArrayImporter importer(type); + RETURN_NOT_OK(importer.Import(array, mapper)); + return importer.MakeRecordBatch(std::move(schema)); +} + +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, struct ArrowSchema* schema, + const DeviceMemoryMapper& mapper) { + auto maybe_schema = ImportSchema(schema); + if (!maybe_schema.ok()) { + ArrowArrayRelease(&array->array); + return maybe_schema.status(); + } + return ImportDeviceRecordBatch(array, *maybe_schema, mapper); +} + ////////////////////////////////////////////////////////////////////////// // C stream export diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h 
index 3b1a013d20dbf..92707a59729fc 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -166,6 +167,135 @@ Result> ImportRecordBatch(struct ArrowArray* array, /// @} +/// \defgroup c-data-device-interface Functions for working with the C data device +/// interface. +/// +/// @{ + +/// \brief EXPERIMENTAL: Type for freeing a sync event +/// +/// If synchronization is necessary for accessing the data on a device, +/// a pointer to an event needs to be passed when exporting the device +/// array. It's the responsibility of the release function for the array +/// to release the event. Both can be null if no sync'ing is necessary. +struct RawSyncEvent { + void* sync_event = NULL; + std::function release_func; +}; + +/// \brief EXPERIMENTAL: Export C++ Array as an ArrowDeviceArray. +/// +/// The resulting ArrowDeviceArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. All buffers in +/// the provided array MUST have the same device_type, otherwise an error +/// will be returned. +/// +/// If a non-null sync_event is provided, then the sync_release func must also be +/// non-null. If the sync_event is null, then the sync_release parameter is not called. +/// +/// \param[in] array Array object to export +/// \param[in] sync_event A struct containing what is needed for syncing if necessary +/// \param[out] out C struct to export the array to +/// \param[out] out_schema optional C struct to export the array type to +ARROW_EXPORT +Status ExportDeviceArray(const Array& array, RawSyncEvent sync_event, + struct ArrowDeviceArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief EXPERIMENTAL: Export C++ RecordBatch as an ArrowDeviceArray. +/// +/// The record batch is exported as if it were a struct array. +/// The resulting ArrowDeviceArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// All buffers of all columns in the record batch must have the same device_type +/// otherwise an error will be returned. If columns are on different devices, +/// they should be exported using different ArrowDeviceArray instances. +/// +/// If a non-null sync_event is provided, then the sync_release func must also be +/// non-null. If the sync_event is null, then the sync_release parameter is ignored. +/// +/// \param[in] batch Record batch to export +/// \param[in] sync_event A struct containing what is needed for syncing if necessary +/// \param[out] out C struct where to export the record batch +/// \param[out] out_schema optional C struct where to export the record batch schema +ARROW_EXPORT +Status ExportDeviceRecordBatch(const RecordBatch& batch, RawSyncEvent sync_event, + struct ArrowDeviceArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +using DeviceMemoryMapper = + std::function>(ArrowDeviceType, int64_t)>; + +/// \brief EXPERIMENTAL: Import C++ device array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. The +/// buffers of the Array are located on the device indicated by the device_type. 
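+///
+/// For example, a mapper for CPU-only data may simply return
+/// default_cpu_memory_manager() for ARROW_DEVICE_CPU and return an error
+/// for any other device type.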
+/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in] type type of the imported array +/// \param[in] mapper A function to map device + id to memory manager +/// \return Imported array object +ARROW_EXPORT +Result> ImportDeviceArray(struct ArrowDeviceArray* array, + std::shared_ptr type, + const DeviceMemoryMapper& mapper); + +/// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// The ArrowSchema struct is released, even if this function fails. The +/// buffers of the Array are located on the device indicated by the device_type. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in,out] type C data interface struct holding the array type +/// \param[in] mapper A function to map device + id to memory manager +/// \return Imported array object +ARROW_EXPORT +Result> ImportDeviceArray(struct ArrowDeviceArray* array, + struct ArrowSchema* type, + const DeviceMemoryMapper& mapper); + +/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data +/// interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The buffers of all columns of the record batch are located on the device +/// indicated by the device type. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in] schema schema of the imported record batch +/// \param[in] mapper A function to map device + id to memory manager +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, std::shared_ptr schema, + const DeviceMemoryMapper& mapper); + +/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema +/// from the C data interface. +/// +/// The type represented by the ArrowSchema struct must be a struct type array. +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The ArrowSchema struct is released, even if this function fails. The buffers +/// of all columns of the record batch are located on the device indicated by the +/// device type. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in,out] schema C data interface struct holding the record batch schema +/// \param[in] mapper A function to map device + id to memory manager +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, struct ArrowSchema* schema, + const DeviceMemoryMapper& mapper); + +/// @} + /// \defgroup c-stream-interface Functions for working with the C data interface. 
/// /// @{ diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 5fe7b653c8970..5c7de8e4a0783 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -565,6 +565,15 @@ struct ArrayExportChecker { ASSERT_EQ(c_export->children, nullptr); } } + + void operator()(struct ArrowDeviceArray* c_export, const ArrayData& expected_data, + const ArrowDeviceType device_type, const int64_t device_id, + const void* sync_event) { + ASSERT_EQ(c_export->device_type, device_type); + ASSERT_EQ(c_export->device_id, device_id); + ASSERT_EQ(c_export->sync_event, sync_event); + this->operator()(&c_export->array, expected_data); + } }; struct RecordBatchExportChecker { @@ -592,6 +601,15 @@ struct RecordBatchExportChecker { ASSERT_EQ(c_export->children, nullptr); } } + + void operator()(struct ArrowDeviceArray* c_export, const RecordBatch& expected_data, + const ArrowDeviceType device_type, const int64_t device_id, + const void* sync_event) { + ASSERT_EQ(c_export->device_type, device_type); + ASSERT_EQ(c_export->device_id, device_id); + ASSERT_EQ(c_export->sync_event, sync_event); + this->operator()(&c_export->array, expected_data); + } }; class TestArrayExport : public ::testing::Test { @@ -1112,6 +1130,392 @@ TEST_F(TestArrayExport, ExportRecordBatch) { } } +//////////////////////////////////////////////////////////////////////////// +// Device Array Export Tests + +static const char kMyDeviceTypeName[] = "arrowtest::MyDevice"; +static const ArrowDeviceType kMyDeviceType = ARROW_DEVICE_EXT_DEV; + +class MyBuffer final : public MutableBuffer { + public: + using MutableBuffer::MutableBuffer; + + ~MyBuffer() { default_memory_pool()->Free(const_cast(data_), size_); } +}; + +class MyMemoryManager : public CPUMemoryManager { + public: + explicit MyMemoryManager(const std::shared_ptr& device) + : CPUMemoryManager(device, default_memory_pool()) {} + + Result> AllocateBuffer(int64_t size) override { + uint8_t* data; + RETURN_NOT_OK(pool_->Allocate(size, &data)); + return std::make_unique(data, size, shared_from_this()); + } + + protected: + Result> CopyBufferFrom( + const std::shared_ptr& buf, + const std::shared_ptr& from) override { + return CopyNonOwnedFrom(*buf, from); + } + Result> CopyNonOwnedFrom( + const Buffer& buf, const std::shared_ptr& from) override { + if (!from->is_cpu()) { + return nullptr; + } + + ARROW_ASSIGN_OR_RAISE(auto dest, AllocateBuffer(buf.size())); + if (buf.size() > 0) { + memcpy(dest->mutable_data(), buf.data(), static_cast(buf.size())); + } + return std::move(dest); + } +}; + +class MyDevice : public Device { + public: + explicit MyDevice(int value) : Device(true), value_(value) {} + const char* type_name() const override { return kMyDeviceTypeName; } + std::string ToString() const override { return kMyDeviceTypeName; } + bool Equals(const Device& other) const override { + if (other.type_name() != kMyDeviceTypeName || other.device_type() != device_type()) { + return false; + } + return checked_cast(other).value_ == value_; + } + DeviceAllocationType device_type() const override { + return static_cast(kMyDeviceType); + } + int64_t device_id() const override { return value_; } + std::shared_ptr default_memory_manager() override { + return std::make_shared(shared_from_this()); + } + + protected: + int value_; +}; + +class TestDeviceArrayExport : public ::testing::Test { + public: + void SetUp() override { pool_ = default_memory_pool(); } + + static Result> ToDeviceData( + const std::shared_ptr& mm, const ArrayData& data) { + 
arrow::BufferVector buffers; + for (const auto& buf : data.buffers) { + if (buf) { + ARROW_ASSIGN_OR_RAISE(auto dest, mm->CopyBuffer(buf, mm)); + buffers.push_back(dest); + } else { + buffers.push_back(nullptr); + } + } + + arrow::ArrayDataVector children; + for (const auto& child : data.child_data) { + ARROW_ASSIGN_OR_RAISE(auto dest, ToDeviceData(mm, *child)); + children.push_back(dest); + } + + return ArrayData::Make(data.type, data.length, buffers, children, data.null_count, + data.offset); + } + + static Result> ToDevice(const std::shared_ptr& mm, + const ArrayData& data) { + ARROW_ASSIGN_OR_RAISE(auto result, ToDeviceData(mm, data)); + return MakeArray(result); + } + + template + static std::function>()> ToDeviceFactory( + const std::shared_ptr& mm, ArrayFactory&& factory) { + return [&]() { return ToDevice(mm, *factory()->data()); }; + } + + static std::function>()> JSONArrayFactory( + const std::shared_ptr& mm, std::shared_ptr type, + const char* json) { + return [=]() { return ToDevice(mm, *ArrayFromJSON(type, json)->data()); }; + } + + template + void TestWithArrayFactory(ArrayFactory&& factory, ExportCheckFunc&& check_func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK_AND_ASSIGN(arr, ToResult(factory())); + ARROW_SCOPED_TRACE("type = ", arr->type()->ToString(), + ", array data = ", arr->ToString()); + const ArrayData& data = *arr->data(); // non-owning reference + struct ArrowDeviceArray c_export; + ASSERT_OK(ExportDeviceArray(*arr, {nullptr, nullptr}, &c_export)); + + ArrayExportGuard guard(&c_export.array); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + + // Release the shared_ptr, underlying data should be held alive + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + check_func(&c_export, data, kMyDeviceType, 1, nullptr); + + // Release the ArrowArray, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestNested(ArrayFactory&& factory) { + ArrayExportChecker checker; + TestWithArrayFactory(std::forward(factory), checker); + } + + void TestNested(const std::shared_ptr& mm, + const std::shared_ptr& type, const char* json) { + TestNested(JSONArrayFactory(mm, type, json)); + } + + template + void TestPrimitive(ArrayFactory&& factory) { + TestNested(std::forward(factory)); + } + + void TestPrimitive(const std::shared_ptr& mm, + const std::shared_ptr& type, const char* json) { + TestNested(mm, type, json); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestDeviceArrayExport, Primitive) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestPrimitive(mm, int8(), "[1, 2, null, -3]"); + TestPrimitive(mm, int16(), "[1, 2, -3]"); + TestPrimitive(mm, int32(), "[1, 2, null, -3]"); + TestPrimitive(mm, int64(), "[1, 2, -3]"); + TestPrimitive(mm, uint8(), "[1, 2, 3]"); + TestPrimitive(mm, uint16(), "[1, 2, null, 3]"); + TestPrimitive(mm, uint32(), "[1, 2, 3]"); + TestPrimitive(mm, uint64(), "[1, 2, null, 3]"); + + TestPrimitive(mm, boolean(), "[true, false, null]"); + + TestPrimitive(mm, float32(), "[1.5, null]"); + TestPrimitive(mm, float64(), "[1.5, null]"); + + TestPrimitive(mm, fixed_size_binary(3), R"(["foo", "bar", null])"); + TestPrimitive(mm, binary(), R"(["foo", "bar", null])"); + TestPrimitive(mm, large_binary(), R"(["foo", "bar", null])"); + TestPrimitive(mm, utf8(), R"(["foo", "bar", null])"); + TestPrimitive(mm, large_utf8(), R"(["foo", "bar", null])"); + 
+ TestPrimitive(mm, decimal(16, 4), R"(["1234.5670", null])"); + TestPrimitive(mm, decimal256(16, 4), R"(["1234.5670", null])"); + + TestPrimitive(mm, month_day_nano_interval(), R"([[-1, 5, 20], null])"); +} + +TEST_F(TestDeviceArrayExport, PrimitiveSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + auto factory = [=]() { + return (*ToDevice(mm, *ArrayFromJSON(int16(), "[1, 2, null, -3]")->data())) + ->Slice(1, 2); + }; + TestPrimitive(factory); +} + +TEST_F(TestDeviceArrayExport, Temporal) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* json = "[1, 2, null, 42]"; + TestPrimitive(mm, date32(), json); + TestPrimitive(mm, date64(), json); + TestPrimitive(mm, time32(TimeUnit::SECOND), json); + TestPrimitive(mm, time32(TimeUnit::MILLI), json); + TestPrimitive(mm, time64(TimeUnit::MICRO), json); + TestPrimitive(mm, time64(TimeUnit::NANO), json); + TestPrimitive(mm, duration(TimeUnit::SECOND), json); + TestPrimitive(mm, duration(TimeUnit::MILLI), json); + TestPrimitive(mm, duration(TimeUnit::MICRO), json); + TestPrimitive(mm, duration(TimeUnit::NANO), json); + TestPrimitive(mm, month_interval(), json); + + TestPrimitive(mm, day_time_interval(), "[[7, 600], null]"); + + json = R"(["1970-01-01","2000-02-29","1900-02-28"])"; + TestPrimitive(mm, timestamp(TimeUnit::SECOND), json); + TestPrimitive(mm, timestamp(TimeUnit::SECOND, "Europe/Paris"), json); + TestPrimitive(mm, timestamp(TimeUnit::MILLI), json); + TestPrimitive(mm, timestamp(TimeUnit::MILLI, "Europe/Paris"), json); + TestPrimitive(mm, timestamp(TimeUnit::MICRO), json); + TestPrimitive(mm, timestamp(TimeUnit::MICRO, "Europe/Paris"), json); + TestPrimitive(mm, timestamp(TimeUnit::NANO), json); + TestPrimitive(mm, timestamp(TimeUnit::NANO, "Europe/Paris"), json); +} + +TEST_F(TestDeviceArrayExport, List) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestNested(mm, list(int8()), "[[1, 2], [3, null], null]"); + TestNested(mm, large_list(uint16()), "[[1, 2], [3, null], null]"); + TestNested(mm, fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(mm, list(large_list(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestDeviceArrayExport, ListSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + { + auto factory = [=]() { + return (*ToDevice( + mm, *ArrayFromJSON(list(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->data())) + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = [=]() { + auto values = + (*ToDevice(mm, + *ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->data())) + ->Slice(1, 6); + auto offsets = (*ToDevice(mm, *ArrayFromJSON(int32(), "[0, 2, 3, 5, 6]")->data())) + ->Slice(2, 4); + return ListArray::FromArrays(*offsets, *values); + }; + TestNested(factory); + } +} + +TEST_F(TestDeviceArrayExport, Struct) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* data = R"([[1, "foo"], [2, null]])"; + auto type = struct_({field("a", int8()), field("b", utf8())}); + TestNested(mm, type, data); +} + +TEST_F(TestDeviceArrayExport, Map) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* json = R"([[[1, "foo"], [2, null]], [[3, "bar"]]])"; + TestNested(mm, map(int8(), utf8()), json); + TestNested(mm, map(int8(), utf8(), /*keys_sorted=*/true), 
json); +} + +TEST_F(TestDeviceArrayExport, Union) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + const char* data = "[null, [42, 1], [43, true], [42, null], [42, 2]]"; + // Dense + auto field_a = field("a", int8()); + auto field_b = field("b", boolean(), /*nullable=*/false); + auto type = dense_union({field_a, field_b}, {42, 43}); + TestNested(mm, type, data); + // Sparse + field_a = field("a", int8(), /*nullable=*/false); + field_b = field("b", boolean()); + type = sparse_union({field_a, field_b}, {42, 43}); + TestNested(mm, type, data); +} + +TEST_F(TestDeviceArrayExport, Extension) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestPrimitive(ToDeviceFactory(mm, ExampleUuid)); + TestPrimitive(ToDeviceFactory(mm, ExampleSmallint)); + TestPrimitive(ToDeviceFactory(mm, ExampleComplex128)); +} + +TEST_F(TestDeviceArrayExport, ExportArrayAndType) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + struct ArrowSchema c_schema {}; + struct ArrowDeviceArray c_array {}; + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array.array); + + auto array = ToDevice(mm, *ArrayFromJSON(int8(), "[1, 2, 3]")->data()).ValueOrDie(); + ASSERT_OK(ExportDeviceArray(*array, {nullptr, nullptr}, &c_array, &c_schema)); + const ArrayData& data = *array->data(); + array.reset(); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_FALSE(ArrowArrayIsReleased(&c_array.array)); + ASSERT_EQ(c_schema.format, std::string("c")); + ASSERT_EQ(c_schema.n_children, 0); + ArrayExportChecker checker{}; + checker(&c_array, data, kMyDeviceType, 1, nullptr); +} + +TEST_F(TestDeviceArrayExport, ExportRecordBatch) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + struct ArrowSchema c_schema {}; + struct ArrowDeviceArray c_array {}; + + auto schema = ::arrow::schema( + {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); + schema = schema->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + auto arr0 = ToDevice(mm, *ArrayFromJSON(int16(), "[1, 2, null]")->data()).ValueOrDie(); + auto arr1 = ToDevice(mm, *ArrayFromJSON(boolean(), "[false, true, false]")->data()) + .ValueOrDie(); + + auto batch_factory = [&]() { return RecordBatch::Make(schema, 3, {arr0, arr1}); }; + + { + auto batch = batch_factory(); + + ASSERT_OK(ExportDeviceRecordBatch(*batch, {nullptr, nullptr}, &c_array, &c_schema)); + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array.array); + RecordBatchExportChecker checker{}; + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + + // create batch anew, with the same buffer pointers + batch = batch_factory(); + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + } + { + // Check one can export both schema and record batch at once + auto batch = batch_factory(); + + ASSERT_OK(ExportDeviceRecordBatch(*batch, {nullptr, nullptr}, &c_array, &c_schema)); + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array.array); + ASSERT_EQ(c_schema.format, std::string("+s")); + ASSERT_EQ(c_schema.n_children, 2); + ASSERT_NE(c_schema.metadata, nullptr); + ASSERT_EQ(kEncodedMetadata2, + std::string(c_schema.metadata, kEncodedMetadata2.size())); + RecordBatchExportChecker checker{}; + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + + // Create batch anew, with the same buffer pointers + batch = 
batch_factory(); + checker(&c_array, *batch, kMyDeviceType, 1, nullptr); + } +} + //////////////////////////////////////////////////////////////////////////// // Schema import tests diff --git a/cpp/src/arrow/device.h b/cpp/src/arrow/device.h index 67c62a5181f28..9cc68fe8c82ce 100644 --- a/cpp/src/arrow/device.h +++ b/cpp/src/arrow/device.h @@ -29,6 +29,24 @@ namespace arrow { +/// \brief EXPERIMENTAL: Device type enum which matches up with C Data Device types +enum class DeviceAllocationType : char { + kCPU = 1, + kCUDA = 2, + kCUDA_HOST = 3, + kOPENCL = 4, + kVULKAN = 7, + kMETAL = 8, + kVPI = 9, + kROCM = 10, + kROCM_HOST = 11, + kEXT_DEV = 12, + kCUDA_MANAGED = 13, + kONEAPI = 14, + kWEBGPU = 15, + kHEXAGON = 16, +}; + class MemoryManager; /// \brief EXPERIMENTAL: Abstract interface for hardware devices @@ -58,6 +76,12 @@ class ARROW_EXPORT Device : public std::enable_shared_from_this, /// \brief Whether this instance points to the same device as another one. virtual bool Equals(const Device&) const = 0; + /// \brief A device ID to identify this device if there are multiple of this type. + /// + /// If there is no "device_id" equivalent (such as for the main CPU device on + /// non-numa systems) returns -1. + virtual int64_t device_id() const { return -1; } + /// \brief Whether this device is the main CPU device. /// /// This shorthand method is very useful when deciding whether a memory address @@ -71,6 +95,9 @@ class ARROW_EXPORT Device : public std::enable_shared_from_this, /// MemoryManager instances with non-default parameters. virtual std::shared_ptr default_memory_manager() = 0; + /// \brief Return the DeviceAllocationType of this device + virtual DeviceAllocationType device_type() const = 0; + protected: ARROW_DISALLOW_COPY_AND_ASSIGN(Device); explicit Device(bool is_cpu = false) : is_cpu_(is_cpu) {} @@ -172,6 +199,7 @@ class ARROW_EXPORT CPUDevice : public Device { const char* type_name() const override; std::string ToString() const override; bool Equals(const Device&) const override; + DeviceAllocationType device_type() const override { return DeviceAllocationType::kCPU; } std::shared_ptr default_memory_manager() override; diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index f754c07d13c89..869ea6453ccda 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -384,7 +384,8 @@ Result> CudaMemoryManager::ViewBufferTo( if (to->is_cpu()) { // Device-on-CPU view ARROW_ASSIGN_OR_RAISE(auto address, GetHostAddress(buf->address())); - return std::make_shared(address, buf->size(), to, buf); + return std::make_shared(address, buf->size(), to, buf, + DeviceAllocationType::kCUDA_HOST); } return nullptr; } diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h index 0115ed19a103d..a1b95c7b4181d 100644 --- a/cpp/src/arrow/gpu/cuda_context.h +++ b/cpp/src/arrow/gpu/cuda_context.h @@ -92,6 +92,10 @@ class ARROW_EXPORT CudaDevice : public Device { std::string ToString() const override; bool Equals(const Device&) const override; std::shared_ptr default_memory_manager() override; + DeviceAllocationType device_type() const override { + return DeviceAllocationType::kCUDA; + } + int64_t device_id() const override { return device_number(); } /// \brief Return a CudaDevice instance for a particular device /// \param[in] device_number the CUDA device number diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 297e4dcf71e44..860c6311d7b2f 100644 --- 
a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -198,6 +198,11 @@ Result> CudaBuffer::ExportForIpc() { return handle; } +CudaHostBuffer::CudaHostBuffer(uint8_t* data, const int64_t size) + : MutableBuffer(data, size) { + device_type_ = DeviceAllocationType::kCUDA_HOST; +} + CudaHostBuffer::~CudaHostBuffer() { auto maybe_manager = CudaDeviceManager::Instance(); ARROW_CHECK_OK(maybe_manager.status()); @@ -480,5 +485,21 @@ Result GetHostAddress(uintptr_t device_ptr) { return static_cast(ptr); } +Result> DefaultMemoryMapper(ArrowDeviceType device_type, + int64_t device_id) { + switch (device_type) { + case ARROW_DEVICE_CPU: + return default_cpu_memory_manager(); + case ARROW_DEVICE_CUDA: + case ARROW_DEVICE_CUDA_HOST: + case ARROW_DEVICE_CUDA_MANAGED: { + ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + return device->default_memory_manager(); + } + default: + return Status::NotImplemented("memory manager not implemented for device"); + } +} + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h index 18c23a507805a..d323bef03494e 100644 --- a/cpp/src/arrow/gpu/cuda_memory.h +++ b/cpp/src/arrow/gpu/cuda_memory.h @@ -21,6 +21,7 @@ #include #include "arrow/buffer.h" +#include "arrow/c/abi.h" #include "arrow/io/concurrency.h" #include "arrow/type_fwd.h" @@ -110,7 +111,8 @@ class ARROW_EXPORT CudaBuffer : public Buffer { /// \brief Device-accessible CPU memory created using cudaHostAlloc class ARROW_EXPORT CudaHostBuffer : public MutableBuffer { public: - using MutableBuffer::MutableBuffer; + CudaHostBuffer(uint8_t* data, const int64_t size); + ~CudaHostBuffer(); /// \brief Return a device address the GPU can read this memory from. @@ -258,5 +260,9 @@ Result GetDeviceAddress(const uint8_t* cpu_data, ARROW_EXPORT Result GetHostAddress(uintptr_t device_ptr); +ARROW_EXPORT +Result> DefaultMemoryMapper(ArrowDeviceType device_type, + int64_t device_id); + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_test.cc b/cpp/src/arrow/gpu/cuda_test.cc index aac45d13831e5..6d392213e231f 100644 --- a/cpp/src/arrow/gpu/cuda_test.cc +++ b/cpp/src/arrow/gpu/cuda_test.cc @@ -364,6 +364,7 @@ TEST_F(TestCudaHostBuffer, AllocateGlobal) { ASSERT_TRUE(host_buffer->is_cpu()); ASSERT_EQ(host_buffer->memory_manager(), cpu_mm_); + ASSERT_EQ(host_buffer->device_type(), DeviceAllocationType::kCUDA_HOST); ASSERT_OK_AND_ASSIGN(auto device_address, host_buffer->GetDeviceAddress(context_)); ASSERT_NE(device_address, 0); @@ -376,6 +377,7 @@ TEST_F(TestCudaHostBuffer, ViewOnDevice) { ASSERT_TRUE(host_buffer->is_cpu()); ASSERT_EQ(host_buffer->memory_manager(), cpu_mm_); + ASSERT_EQ(host_buffer->device_type(), DeviceAllocationType::kCUDA_HOST); // Try to view the host buffer on the device. This should correspond to // GetDeviceAddress() in the previous test. @@ -385,6 +387,7 @@ TEST_F(TestCudaHostBuffer, ViewOnDevice) { ASSERT_NE(device_buffer->address(), 0); ASSERT_EQ(device_buffer->size(), host_buffer->size()); ASSERT_EQ(device_buffer->parent(), host_buffer); + ASSERT_EQ(device_buffer->device_type(), DeviceAllocationType::kCUDA); // View back the device buffer on the CPU. This should roundtrip. 
ASSERT_OK_AND_ASSIGN(auto buffer, Buffer::View(device_buffer, cpu_mm_)); @@ -393,6 +396,7 @@ TEST_F(TestCudaHostBuffer, ViewOnDevice) { ASSERT_EQ(buffer->address(), host_buffer->address()); ASSERT_EQ(buffer->size(), host_buffer->size()); ASSERT_EQ(buffer->parent(), device_buffer); + ASSERT_EQ(buffer->device_type(), DeviceAllocationType::kCUDA_HOST); } // ------------------------------------------------------------------------ From 724ba6cd2391a856b72636385c56d664f74b577e Mon Sep 17 00:00:00 2001 From: Thor <8681572+thorfour@users.noreply.github.com> Date: Tue, 25 Jul 2023 09:55:57 -0500 Subject: [PATCH 043/749] GH-36858: [Go] Fix dictionary builder leak (#36859) ### Rationale for this change This fixes a potential leak when using a memory allocator that can panic. ### What changes are included in this PR? This moves the `Retain` call to happen after the last potential allocation in `newWithDictOffset` to prevent a leak. ### Are these changes tested? A unit test was added that causes the builder to panic on the `GetDictArrayData` step. ### Are there any user-facing changes? * Closes: #36858 Authored-by: thorfour Signed-off-by: Matt Topol --- go/arrow/array/dictionary.go | 2 +- go/arrow/array/dictionary_test.go | 46 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/go/arrow/array/dictionary.go b/go/arrow/array/dictionary.go index 8c3ffb5247fe0..da1aea50b24f8 100644 --- a/go/arrow/array/dictionary.go +++ b/go/arrow/array/dictionary.go @@ -814,11 +814,11 @@ func (b *dictionaryBuilder) newWithDictOffset(offset int) (indices, dict *Data, defer idxarr.Release() indices = idxarr.Data().(*Data) - indices.Retain() b.deltaOffset = b.memoTable.Size() dict, err = GetDictArrayData(b.mem, b.dt.ValueType, b.memoTable, offset) b.reset() + indices.Retain() return } diff --git a/go/arrow/array/dictionary_test.go b/go/arrow/array/dictionary_test.go index cc252e26855db..8bb9edebf89bc 100644 --- a/go/arrow/array/dictionary_test.go +++ b/go/arrow/array/dictionary_test.go @@ -1800,3 +1800,49 @@ func TestDictionaryAppendIndices(t *testing.T) { }) } } + +type panicAllocator struct { + n int + paniced bool + memory.Allocator +} + +func (p *panicAllocator) Allocate(size int) []byte { + if size > p.n { + p.paniced = true + panic("panic allocator") + } + return p.Allocator.Allocate(size) +} + +func (p *panicAllocator) Reallocate(size int, b []byte) []byte { + return p.Allocator.Reallocate(size, b) +} + +func (p *panicAllocator) Free(b []byte) { + p.Allocator.Free(b) +} + +func TestBinaryDictionaryPanic(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + allocator := &panicAllocator{ + n: 400, + Allocator: mem, + } + + expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{}, ValueType: arrow.BinaryTypes.String} + bldr := array.NewDictionaryBuilder(allocator, expectedType) + defer bldr.Release() + + bldr.AppendNull() + allocator.n = 0 // force panic + func() { + defer func() { + recover() + }() + bldr.NewArray() + }() + assert.True(t, allocator.paniced) +} From 3ac880df2e0fc3ce364ca37073e1bb0779896d46 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 25 Jul 2023 09:37:21 -0700 Subject: [PATCH 044/749] GH-34213: [C++] Use recursive calls without a delimiter if the user is doing a recursive GetFileInfo (#35440) ### Rationale for this change The old model of "walk"ing the directory could lead to a large number of calls. 
If someone is fully listing a bucket they will need to make one S3 API call for every single directory in the bucket. With this approach there is only 1 call made for every 1000 files, regardless of how they are spread across directories. The only potential regression would be if max_recursion was set to something > 1. For example, if a user had: ``` bucket/foo/bar/<10000 files here> ``` Then if they make a request for `bucket` with `max_recursion=2` the new approach will list all 10,000 files and then eliminate the files that don't match. However, I believe these cases (using max_recursion) to be rarer and less common than the typical case of listing all files (which dataset discovery does). ### What changes are included in this PR? The algorithm behind GetFileInfo and DeleteDirContents in S3FileSystem has changed. ### Are these changes tested? Yes, there should be no behavior change. All of the existing filesystem tests will test this change. ### Are there any user-facing changes? No, other than (hopefully) better performance. * Closes: #34213 Lead-authored-by: Weston Pace Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/filesystem/filesystem_test.cc | 41 + cpp/src/arrow/filesystem/path_util.cc | 37 + cpp/src/arrow/filesystem/path_util.h | 20 +- cpp/src/arrow/filesystem/s3fs.cc | 828 ++++++++++---------- cpp/src/arrow/filesystem/s3fs_test.cc | 86 +- cpp/src/arrow/util/async_util.cc | 6 +- cpp/src/arrow/util/async_util_test.cc | 23 + 7 files changed, 638 insertions(+), 403 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem_test.cc b/cpp/src/arrow/filesystem/filesystem_test.cc index b450a69913135..c76c3d27e8f8e 100644 --- a/cpp/src/arrow/filesystem/filesystem_test.cc +++ b/cpp/src/arrow/filesystem/filesystem_test.cc @@ -87,6 +87,34 @@ TEST(PathUtil, SplitAbstractPath) { AssertPartsEqual(parts, {"abc", "def.ghi"}); } +TEST(PathUtil, SliceAbstractPath) { + std::string path = "abc"; + ASSERT_EQ("abc", SliceAbstractPath(path, 0, 1)); + ASSERT_EQ("abc", SliceAbstractPath(path, 0, 2)); + ASSERT_EQ("", SliceAbstractPath(path, 0, 0)); + ASSERT_EQ("", SliceAbstractPath(path, 1, 0)); + + path = "abc/def\\x/y.ext"; + ASSERT_EQ("abc/def\\x/y.ext", SliceAbstractPath(path, 0, 4)); + ASSERT_EQ("abc/def\\x/y.ext", SliceAbstractPath(path, 0, 3)); + ASSERT_EQ("abc/def\\x", SliceAbstractPath(path, 0, 2)); + ASSERT_EQ("abc", SliceAbstractPath(path, 0, 1)); + ASSERT_EQ("def\\x/y.ext", SliceAbstractPath(path, 1, 2)); + ASSERT_EQ("def\\x/y.ext", SliceAbstractPath(path, 1, 3)); + ASSERT_EQ("def\\x", SliceAbstractPath(path, 1, 1)); + ASSERT_EQ("y.ext", SliceAbstractPath(path, 2, 1)); + ASSERT_EQ("", SliceAbstractPath(path, 3, 1)); + + path = "x/y\\z"; + ASSERT_EQ("x", SliceAbstractPath(path, 0, 1)); + ASSERT_EQ("x/y", SliceAbstractPath(path, 0, 1, /*sep=*/'\\')); + + // Invalid cases but we shouldn't crash + ASSERT_EQ("", SliceAbstractPath(path, -1, 1)); + ASSERT_EQ("", SliceAbstractPath(path, 0, -1)); + ASSERT_EQ("", SliceAbstractPath(path, -1, -1)); +} + TEST(PathUtil, GetAbstractPathExtension) { ASSERT_EQ(GetAbstractPathExtension("abc.txt"), "txt"); ASSERT_EQ(GetAbstractPathExtension("dir/abc.txt"), "txt"); @@ -98,6 +126,19 @@ TEST(PathUtil, GetAbstractPathExtension) { ASSERT_EQ(GetAbstractPathExtension("/run.d/abc"), ""); } +TEST(PathUtil, GetAbstractPathDepth) { + ASSERT_EQ(0, GetAbstractPathDepth("")); + ASSERT_EQ(0, GetAbstractPathDepth("/")); + ASSERT_EQ(1, GetAbstractPathDepth("foo")); + ASSERT_EQ(1, GetAbstractPathDepth("foo/")); + ASSERT_EQ(1, 
GetAbstractPathDepth("/foo")); + ASSERT_EQ(1, GetAbstractPathDepth("/foo/")); + ASSERT_EQ(2, GetAbstractPathDepth("/foo/bar")); + ASSERT_EQ(2, GetAbstractPathDepth("/foo/bar/")); + ASSERT_EQ(2, GetAbstractPathDepth("foo/bar")); + ASSERT_EQ(2, GetAbstractPathDepth("foo/bar/")); +} + TEST(PathUtil, GetAbstractPathParent) { std::pair pair; diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index e25e544f0341f..90af3c66ff8d4 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -17,6 +17,7 @@ #include #include +#include #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" @@ -66,6 +67,42 @@ std::vector SplitAbstractPath(const std::string& path, char sep) { return parts; } +std::string SliceAbstractPath(const std::string& s, int offset, int length, char sep) { + if (offset < 0 || length < 0) { + return ""; + } + std::vector components = SplitAbstractPath(s, sep); + std::stringstream combined; + if (offset >= static_cast(components.size())) { + return ""; + } + int end = offset + length; + if (end > static_cast(components.size())) { + end = static_cast(components.size()); + } + for (int i = offset; i < end; i++) { + combined << components[i]; + if (i < end - 1) { + combined << sep; + } + } + return combined.str(); +} + +int GetAbstractPathDepth(std::string_view path) { + if (path.empty()) { + return 0; + } + int depth = static_cast(std::count(path.begin(), path.end(), kSep)) + 1; + if (path.back() == kSep) { + depth -= 1; + } + if (path.front() == kSep) { + depth -= 1; + } + return depth; +} + std::pair GetAbstractPathParent(const std::string& s) { // XXX should strip trailing slash? diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index b821e79338490..13a74b7fa12c8 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -38,9 +38,25 @@ constexpr char kSep = '/'; ARROW_EXPORT std::vector SplitAbstractPath(const std::string& path, char sep = kSep); -// Return the extension of the file +// Slice the individual components of an abstract path and combine them +// +// If offset or length are negative then an empty string is returned +// If offset is >= the number of components then an empty string is returned +// If offset + length is >= the number of components then length is truncated ARROW_EXPORT -std::string GetAbstractPathExtension(const std::string& s); +std::string SliceAbstractPath(const std::string& path, int offset, int length, + char sep = kSep); + +// Return the extension of the file +ARROW_EXPORT std::string GetAbstractPathExtension(const std::string& s); + +// Return the depth (number of components) of an abstract path +// +// Trailing slashes do not count towards depth +// Leading slashes do not count towards depth +// +// The root path ("/") has depth 0 +ARROW_EXPORT int GetAbstractPathDepth(std::string_view path); // Return the parent directory and basename of an abstract path. Both values may be // empty. 
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 29b45e1dc9abe..c67f7668ffa4d 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #ifdef _WIN32 @@ -59,6 +60,7 @@ #endif #include #include +#include #include #include #include @@ -91,6 +93,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/async_generator.h" +#include "arrow/util/async_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" @@ -734,8 +737,12 @@ class S3ClientLock { // with a shared mutex locked in shared mode. // The reason is that locking again in shared mode can block while // there are threads waiting to take the lock in exclusive mode. - // Therefore, we should avoid to keep the S3ClientLock taken - before is it taken again. This methods helps doing that. + // Therefore, we should avoid obtaining the S3ClientLock when + we already have it locked. + // + // This method helps by moving the S3ClientLock into a temporary + that is immediately destroyed so the lock will be released as + soon as we are done making the call to the underlying client. // // (see GH-36523) S3ClientLock Move() { return std::move(*this); } @@ -1702,147 +1709,6 @@ void FileObjectToInfo(const S3Model::Object& obj, FileInfo* info) { info->set_mtime(FromAwsDatetime(obj.GetLastModified())); } -struct TreeWalker : public std::enable_shared_from_this { - using ResultHandler = std::function; - using ErrorHandler = std::function& error)>; - using RecursionHandler = std::function(int32_t nesting_depth)>; - - std::shared_ptr holder_; - io::IOContext io_context_; - const std::string bucket_; - const std::string base_dir_; - const int32_t max_keys_; - const ResultHandler result_handler_; - const ErrorHandler error_handler_; - const RecursionHandler recursion_handler_; - - template - static Status Walk(Args&&... args) { - return WalkAsync(std::forward(args)...).status(); - } - - template - static Future<> WalkAsync(Args&&... 
args) { - auto self = std::make_shared(std::forward(args)...); - return self->DoWalk(); - } - - TreeWalker(std::shared_ptr holder, io::IOContext io_context, - std::string bucket, std::string base_dir, int32_t max_keys, - ResultHandler result_handler, ErrorHandler error_handler, - RecursionHandler recursion_handler) - : holder_(std::move(holder)), - io_context_(io_context), - bucket_(std::move(bucket)), - base_dir_(std::move(base_dir)), - max_keys_(max_keys), - result_handler_(std::move(result_handler)), - error_handler_(std::move(error_handler)), - recursion_handler_(std::move(recursion_handler)) {} - - private: - std::shared_ptr task_group_; - std::mutex mutex_; - - Future<> DoWalk() { - task_group_ = - TaskGroup::MakeThreaded(io_context_.executor(), io_context_.stop_token()); - WalkChild(base_dir_, /*nesting_depth=*/0); - // When this returns, ListObjectsV2 tasks either have finished or will exit early - return task_group_->FinishAsync(); - } - - bool ok() const { return task_group_->ok(); } - - struct ListObjectsV2Handler { - std::shared_ptr walker; - std::string prefix; - int32_t nesting_depth; - S3Model::ListObjectsV2Request req; - - Status operator()(const Result& result) { - // Serialize calls to operation-specific handlers - if (!walker->ok()) { - // Early exit: avoid executing handlers if DoWalk() returned - return Status::OK(); - } - if (!result.ok()) { - return result.status(); - } - const auto& outcome = *result; - if (!outcome.IsSuccess()) { - { - std::lock_guard guard(walker->mutex_); - return walker->error_handler_(outcome.GetError()); - } - } - return HandleResult(outcome.GetResult()); - } - - void SpawnListObjectsV2() { - auto cb = *this; - walker->task_group_->Append([cb]() mutable { - ARROW_ASSIGN_OR_RAISE(auto client_lock, cb.walker->holder_->Lock()); - Result result = - client_lock.Move()->ListObjectsV2(cb.req); - return cb(std::move(result)); - }); - } - - Status HandleResult(const S3Model::ListObjectsV2Result& result) { - bool recurse; - { - // Only one thread should be running result_handler_/recursion_handler_ at a time - std::lock_guard guard(walker->mutex_); - recurse = result.GetCommonPrefixes().size() > 0; - if (recurse) { - ARROW_ASSIGN_OR_RAISE(auto maybe_recurse, - walker->recursion_handler_(nesting_depth + 1)); - recurse &= maybe_recurse; - } - RETURN_NOT_OK(walker->result_handler_(prefix, result)); - } - if (recurse) { - walker->WalkChildren(result, nesting_depth + 1); - } - // If the result was truncated, issue a continuation request to get - // further directory entries. 
- if (result.GetIsTruncated()) { - DCHECK(!result.GetNextContinuationToken().empty()); - req.SetContinuationToken(result.GetNextContinuationToken()); - SpawnListObjectsV2(); - } - return Status::OK(); - } - - void Start() { - req.SetBucket(ToAwsString(walker->bucket_)); - if (!prefix.empty()) { - req.SetPrefix(ToAwsString(prefix) + kSep); - } - req.SetDelimiter(Aws::String() + kSep); - req.SetMaxKeys(walker->max_keys_); - SpawnListObjectsV2(); - } - }; - - void WalkChild(std::string key, int32_t nesting_depth) { - ListObjectsV2Handler handler{shared_from_this(), std::move(key), nesting_depth, {}}; - handler.Start(); - } - - void WalkChildren(const S3Model::ListObjectsV2Result& result, int32_t nesting_depth) { - for (const auto& prefix : result.GetCommonPrefixes()) { - const auto child_key = - internal::RemoveTrailingSlash(FromAwsString(prefix.GetPrefix())); - WalkChild(std::string{child_key}, nesting_depth); - } - } - - friend struct ListObjectsV2Handler; -}; - } // namespace // ----------------------------------------------------------------------- @@ -1855,11 +1721,9 @@ class S3FileSystem::Impl : public std::enable_shared_from_this holder_; std::optional backend_; - const int32_t kListObjectsMaxKeys = 1000; + static constexpr int32_t kListObjectsMaxKeys = 1000; // At most 1000 keys per multiple-delete request - const int32_t kMultipleDeleteMaxKeys = 1000; - // Limit recursing depth, since a recursion bomb can be created - const int32_t kMaxNestingDepth = 100; + static constexpr int32_t kMultipleDeleteMaxKeys = 1000; explicit Impl(S3Options options, io::IOContext io_context) : builder_(std::move(options)), io_context_(io_context) {} @@ -2073,197 +1937,303 @@ class S3FileSystem::Impl : public std::enable_shared_from_this= kMaxNestingDepth) { - return Status::IOError("S3 filesystem tree exceeds maximum nesting depth (", - kMaxNestingDepth, ")"); + static FileInfo MakeDirectoryInfo(std::string dirname) { + FileInfo dir; + dir.set_type(FileType::Directory); + dir.set_path(std::move(dirname)); + return dir; + } + + static std::vector MakeDirectoryInfos(std::vector dirnames) { + std::vector dir_infos; + for (auto& dirname : dirnames) { + dir_infos.push_back(MakeDirectoryInfo(std::move(dirname))); } - return Status::OK(); + return dir_infos; } - // A helper class for Walk and WalkAsync - struct FileInfoCollector { - FileInfoCollector(std::string bucket, std::string key, const FileSelector& select) - : bucket(std::move(bucket)), - key(std::move(key)), - allow_not_found(select.allow_not_found) {} + using FileInfoSink = PushGenerator>::Producer; - Status Collect(const std::string& prefix, const S3Model::ListObjectsV2Result& result, - std::vector* out) { - // Walk "directories" - for (const auto& child_prefix : result.GetCommonPrefixes()) { - is_empty = false; - const auto child_key = - internal::RemoveTrailingSlash(FromAwsString(child_prefix.GetPrefix())); - std::stringstream child_path; - child_path << bucket << kSep << child_key; - FileInfo info; - info.set_path(child_path.str()); - info.set_type(FileType::Directory); - out->push_back(std::move(info)); + struct FileListerState { + FileInfoSink files_queue; + const bool allow_not_found; + const int max_recursion; + const bool include_implicit_dirs; + const io::IOContext io_context; + S3ClientHolder* const holder; + + S3Model::ListObjectsV2Request req; + std::unordered_set directories; + bool empty = true; + + FileListerState(PushGenerator>::Producer files_queue, + FileSelector select, const std::string& bucket, + const std::string& key, bool 
include_implicit_dirs, + io::IOContext io_context, S3ClientHolder* holder) + : files_queue(std::move(files_queue)), + allow_not_found(select.allow_not_found), + max_recursion(select.max_recursion), + include_implicit_dirs(include_implicit_dirs), + io_context(std::move(io_context)), + holder(holder) { + req.SetBucket(bucket); + req.SetMaxKeys(kListObjectsMaxKeys); + if (!key.empty()) { + req.SetPrefix(key + kSep); } - // Walk "files" - for (const auto& obj : result.GetContents()) { - is_empty = false; - FileInfo info; - const auto child_key = internal::RemoveTrailingSlash(FromAwsString(obj.GetKey())); - if (child_key == std::string_view(prefix)) { - // Amazon can return the "directory" key itself as part of the results, skip - continue; - } - std::stringstream child_path; - child_path << bucket << kSep << child_key; - info.set_path(child_path.str()); - FileObjectToInfo(obj, &info); - out->push_back(std::move(info)); + if (!select.recursive) { + req.SetDelimiter(Aws::String() + kSep); } - return Status::OK(); } - Status Finish(Impl* impl) { - // If no contents were found, perhaps it's an empty "directory", - // or perhaps it's a nonexistent entry. Check. - if (is_empty && !allow_not_found) { - ARROW_ASSIGN_OR_RAISE(bool is_actually_empty, - impl->IsEmptyDirectory(bucket, key)); - if (!is_actually_empty) { - return PathNotFound(bucket, key); - } + void Finish() { + // `empty` means that we didn't get a single file info back from S3. This may be + // a situation that we should consider as PathNotFound. + // + // * If the prefix is empty then we were querying the contents of an entire bucket + // and this is not a PathNotFound case because if the bucket didn't exist then + // we would have received an error and not an empty set of results. + // + // * If the prefix is not empty then we asked for all files under a particular + // directory. S3 will also return the directory itself, if it exists. So if + // we get zero results then we know that there are no files under the directory + // and the directory itself doesn't exist. This should be considered PathNotFound + if (empty && !allow_not_found && !req.GetPrefix().empty()) { + files_queue.Push(PathNotFound(req.GetBucket(), req.GetPrefix())); } - return Status::OK(); } - std::string bucket; - std::string key; - bool allow_not_found; - bool is_empty = true; + // Given a path, iterate through all possible sub-paths and, if we haven't + // seen that sub-path before, return it. + // + // For example, given A/B/C we might return A/B and A if we have not seen + // those paths before. This allows us to consider "implicit" directories which + // don't exist as objects in S3 but can be inferred. + std::vector GetNewDirectories(const std::string_view& path) { + std::string current(path); + std::string base = req.GetBucket(); + if (!req.GetPrefix().empty()) { + base = base + kSep + std::string(internal::RemoveTrailingSlash(req.GetPrefix())); + } + std::vector new_directories; + while (true) { + const std::string parent_dir = internal::GetAbstractPathParent(current).first; + if (parent_dir.empty()) { + break; + } + current = parent_dir; + if (current == base) { + break; + } + if (directories.insert(parent_dir).second) { + new_directories.push_back(std::move(parent_dir)); + } + } + return new_directories; + } }; - // Workhorse for GetFileInfo(FileSelector...) 
- Status Walk(const FileSelector& select, const std::string& bucket, - const std::string& key, std::vector* out) { - RETURN_NOT_OK(CheckS3Initialized()); + struct FileListerTask : public util::AsyncTaskScheduler::Task { + std::shared_ptr state; + util::AsyncTaskScheduler* scheduler; - FileInfoCollector collector(bucket, key, select); + FileListerTask(std::shared_ptr state, + util::AsyncTaskScheduler* scheduler) + : state(std::move(state)), scheduler(scheduler) {} - auto handle_error = [&](const AWSError& error) -> Status { - if (select.allow_not_found && IsNotFound(error)) { - return Status::OK(); + std::vector ToFileInfos(const std::string& bucket, + const std::string& prefix, + const S3Model::ListObjectsV2Result& result) { + std::vector file_infos; + // If this is a non-recursive listing we may see "common prefixes" which represent + // directories we did not recurse into. We will add those as directories. + for (const auto& child_prefix : result.GetCommonPrefixes()) { + const auto child_key = + internal::RemoveTrailingSlash(FromAwsString(child_prefix.GetPrefix())); + std::stringstream child_path_ss; + child_path_ss << bucket << kSep << child_key; + FileInfo info; + info.set_path(child_path_ss.str()); + info.set_type(FileType::Directory); + file_infos.push_back(std::move(info)); } - return ErrorToStatus(std::forward_as_tuple("When listing objects under key '", key, - "' in bucket '", bucket, "': "), - "ListObjectsV2", error); - }; - - auto handle_recursion = [&](int32_t nesting_depth) -> Result { - RETURN_NOT_OK(CheckNestingDepth(nesting_depth)); - return select.recursive && nesting_depth <= select.max_recursion; - }; - - auto handle_results = [&](const std::string& prefix, - const S3Model::ListObjectsV2Result& result) -> Status { - return collector.Collect(prefix, result, out); - }; - - RETURN_NOT_OK(TreeWalker::Walk(holder_, io_context_, bucket, key, kListObjectsMaxKeys, - handle_results, handle_error, handle_recursion)); - - // If no contents were found, perhaps it's an empty "directory", - // or perhaps it's a nonexistent entry. Check. - RETURN_NOT_OK(collector.Finish(this)); - // Sort results for convenience, since they can come massively out of order - std::sort(out->begin(), out->end(), FileInfo::ByPath{}); - return Status::OK(); - } - - // Workhorse for GetFileInfoGenerator(FileSelector...) - FileInfoGenerator WalkAsync(const FileSelector& select, const std::string& bucket, - const std::string& key) { - PushGenerator> gen; - auto producer = gen.producer(); - auto collector = std::make_shared(bucket, key, select); - auto self = shared_from_this(); + // S3 doesn't have any concept of "max depth" and so we emulate it by counting the + // number of '/' characters. E.g. if the user is searching bucket/subdirA/subdirB + // then the starting depth is 2. + // A file subdirA/subdirB/somefile will have a child depth of 2 and a "depth" of 0. + // A file subdirA/subdirB/subdirC/somefile will have a child depth of 3 and a + // "depth" of 1 + int base_depth = internal::GetAbstractPathDepth(prefix); + for (const auto& obj : result.GetContents()) { + if (obj.GetKey() == prefix) { + // S3 will return the basedir itself (if it is a file / empty file). We don't + // want that. But this is still considered "finding the basedir" and so we mark + // it "not empty". 
+ state->empty = false; + continue; + } + std::string child_key = + std::string(internal::RemoveTrailingSlash(FromAwsString(obj.GetKey()))); + bool had_trailing_slash = child_key.size() != obj.GetKey().size(); + int child_depth = internal::GetAbstractPathDepth(child_key); + // Recursion depth is 1 smaller because a path with depth 1 (e.g. foo) is + // considered to have a "recursion" of 0 + int recursion_depth = child_depth - base_depth - 1; + if (recursion_depth > state->max_recursion) { + // If we have A/B/C/D and max_recursion is 2 then we ignore this (don't add it + // to file_infos) but we still want to potentially add A and A/B as directories. + // So we "pretend" like we have a file A/B/C for the call to GetNewDirectories + // below + int to_trim = recursion_depth - state->max_recursion - 1; + if (to_trim > 0) { + child_key = bucket + kSep + + internal::SliceAbstractPath(child_key, 0, child_depth - to_trim); + } else { + child_key = bucket + kSep + child_key; + } + } else { + // If the file isn't beyond our max recursion then count it as a file + // unless it's empty and then it depends on whether or not the file ends + // with a trailing slash + std::stringstream child_path_ss; + child_path_ss << bucket << kSep << child_key; + child_key = child_path_ss.str(); + if (obj.GetSize() > 0 || !had_trailing_slash) { + // We found a real file + FileInfo info; + info.set_path(child_key); + FileObjectToInfo(obj, &info); + file_infos.push_back(std::move(info)); + } else { + // We found an empty file and we want to treat it like a directory. Only + // add it if we haven't seen this directory before. + if (state->directories.insert(child_key).second) { + file_infos.push_back(MakeDirectoryInfo(child_key)); + } + } + } - auto handle_error = [select, bucket, key](const AWSError& error) -> Status { - if (select.allow_not_found && IsNotFound(error)) { - return Status::OK(); + if (state->include_implicit_dirs) { + // Now that we've dealt with the file itself we need to look at each of the + // parent paths and potentially add them as directories. For example, after + // finding a file A/B/C/D we want to consider adding directories A, A/B, and + // A/B/C. + for (const auto& newdir : state->GetNewDirectories(child_key)) { + file_infos.push_back(MakeDirectoryInfo(newdir)); + } + } } - return ErrorToStatus(std::forward_as_tuple("When listing objects under key '", key, - "' in bucket '", bucket, "': "), - "ListObjectsV2", error); - }; - - auto handle_recursion = [producer, select, - self](int32_t nesting_depth) -> Result { - if (producer.is_closed()) { - return false; + if (file_infos.size() > 0) { + state->empty = false; } - RETURN_NOT_OK(self->CheckNestingDepth(nesting_depth)); - return select.recursive && nesting_depth <= select.max_recursion; - }; + return file_infos; + } - auto handle_results = - [collector, producer]( - const std::string& prefix, - const S3Model::ListObjectsV2Result& result) mutable -> Status { - std::vector out; - RETURN_NOT_OK(collector->Collect(prefix, result, &out)); - if (!out.empty()) { - producer.Push(std::move(out)); + void Run() { + // We are on an I/O thread now so just synchronously make the call and interpret the + // results. 
+ Result client_lock = state->holder->Lock(); + if (!client_lock.ok()) { + state->files_queue.Push(client_lock.status()); + return; } - return Status::OK(); - }; - - TreeWalker::WalkAsync(holder_, io_context_, bucket, key, kListObjectsMaxKeys, - handle_results, handle_error, handle_recursion) - .AddCallback([collector, producer, self](const Status& status) mutable { - auto st = collector->Finish(self.get()); - if (!st.ok()) { - producer.Push(st); - } - producer.Close(); - }); - return gen; - } - - struct WalkResult { - std::vector file_keys; - std::vector dir_keys; - }; - Future> WalkForDeleteDirAsync(const std::string& bucket, - const std::string& key) { - auto state = std::make_shared(); - - auto handle_results = [state](const std::string& prefix, - const S3Model::ListObjectsV2Result& result) -> Status { - // Walk "files" - state->file_keys.reserve(state->file_keys.size() + result.GetContents().size()); - for (const auto& obj : result.GetContents()) { - state->file_keys.emplace_back(FromAwsString(obj.GetKey())); + S3Model::ListObjectsV2Outcome outcome = + client_lock->Move()->ListObjectsV2(state->req); + if (!outcome.IsSuccess()) { + const auto& err = outcome.GetError(); + if (state->allow_not_found && IsNotFound(err)) { + return; + } + state->files_queue.Push( + ErrorToStatus(std::forward_as_tuple("When listing objects under key '", + state->req.GetPrefix(), "' in bucket '", + state->req.GetBucket(), "': "), + "ListObjectsV2", err)); + return; } - // Walk "directories" - state->dir_keys.reserve(state->dir_keys.size() + result.GetCommonPrefixes().size()); - for (const auto& prefix : result.GetCommonPrefixes()) { - state->dir_keys.emplace_back(FromAwsString(prefix.GetPrefix())); + const S3Model::ListObjectsV2Result& result = outcome.GetResult(); + // We could immediately schedule the continuation (if there are enough results to + // trigger paging) but that would introduce race condition complexity for arguably + // little benefit. + std::vector file_infos = + ToFileInfos(state->req.GetBucket(), state->req.GetPrefix(), result); + if (file_infos.size() > 0) { + state->files_queue.Push(std::move(file_infos)); } - return Status::OK(); - }; - auto handle_error = [=](const AWSError& error) -> Status { - return ErrorToStatus(std::forward_as_tuple("When listing objects under key '", key, - "' in bucket '", bucket, "': "), - "ListObjectsV2", error); - }; + // If there are enough files to warrant a continuation then go ahead and schedule + // that now. + if (result.GetIsTruncated()) { + DCHECK(!result.GetNextContinuationToken().empty()); + state->req.SetContinuationToken(result.GetNextContinuationToken()); + scheduler->AddTask(std::make_unique(state, scheduler)); + } else { + // Otherwise, we have finished listing all the files + state->Finish(); + } + } - auto self = shared_from_this(); - auto handle_recursion = [self](int32_t nesting_depth) -> Result { - RETURN_NOT_OK(self->CheckNestingDepth(nesting_depth)); - return true; // Recurse - }; + Result> operator()() override { + return state->io_context.executor()->Submit([this] { + Run(); + return Status::OK(); + }); + } + std::string_view name() const override { return "S3ListFiles"; } + }; - return TreeWalker::WalkAsync(holder_, io_context_, bucket, key, kListObjectsMaxKeys, - handle_results, handle_error, handle_recursion) - .Then([state]() { return state; }); + // Lists all file, potentially recursively, in a bucket + // + // include_implicit_dirs controls whether or not implicit directories should be + // included. 
These are directories that are not actually file objects but instead are + // inferred from other objects. + // + // For example, if a file exists with path A/B/C then implicit directories A/ and A/B/ + // will exist even if there are no file objects with these paths. + void ListAsync(const FileSelector& select, const std::string& bucket, + const std::string& key, bool include_implicit_dirs, + util::AsyncTaskScheduler* scheduler, FileInfoSink sink) { + // We can only fetch kListObjectsMaxKeys files at a time and so we create a + // scheduler and schedule a task to grab the first batch. Once that's done we + // schedule a new task for the next batch. All of these tasks share the same + // FileListerState object but none of these tasks run in parallel so there is + // no need to worry about mutexes + auto state = std::make_shared(sink, select, bucket, key, + include_implicit_dirs, io_context_, + this->holder_.get()); + + // Create the first file lister task (it may spawn more) + auto file_lister_task = std::make_unique(state, scheduler); + scheduler->AddTask(std::move(file_lister_task)); + } + + // Fully list all files from all buckets + void FullListAsync(bool include_implicit_dirs, util::AsyncTaskScheduler* scheduler, + FileInfoSink sink, bool recursive) { + scheduler->AddSimpleTask( + [this, scheduler, sink, include_implicit_dirs, recursive]() mutable { + return ListBucketsAsync().Then( + [this, scheduler, sink, include_implicit_dirs, + recursive](const std::vector& buckets) mutable { + // Return the buckets themselves as directories + std::vector buckets_as_directories = + MakeDirectoryInfos(buckets); + sink.Push(std::move(buckets_as_directories)); + + if (recursive) { + // Recursively list each bucket (these will run in parallel but sink + // should be thread safe and so this is ok) + for (const auto& bucket : buckets) { + FileSelector select; + select.allow_not_found = true; + select.recursive = true; + select.base_dir = bucket; + ListAsync(select, bucket, "", include_implicit_dirs, scheduler, sink); + } + } + }); + }, + std::string_view("FullListBucketScan")); } // Delete multiple objects at once @@ -2297,12 +2267,14 @@ class S3FileSystem::Impl : public std::enable_shared_from_this> futures; - futures.reserve(keys.size() / chunk_size + 1); + futures.reserve(bit_util::CeilDiv(keys.size(), chunk_size)); for (size_t start = 0; start < keys.size(); start += chunk_size) { S3Model::DeleteObjectsRequest req; S3Model::Delete del; - for (size_t i = start; i < std::min(keys.size(), chunk_size); ++i) { + size_t remaining = keys.size() - start; + size_t next_chunk_size = std::min(remaining, chunk_size); + for (size_t i = start; i < start + next_chunk_size; ++i) { del.AddObjects(S3Model::ObjectIdentifier().WithKey(ToAwsString(keys[i]))); } req.SetBucket(ToAwsString(bucket)); @@ -2317,37 +2289,151 @@ class S3FileSystem::Impl : public std::enable_shared_from_this& keys) { return DeleteObjectsAsync(bucket, keys).status(); } + // Check to make sure the given path is not a file + // + // Returns true if the path seems to be a directory, false if it is a file + Future EnsureIsDirAsync(const std::string& bucket, const std::string& key) { + if (key.empty()) { + // There is no way for a bucket to be a file + return Future::MakeFinished(true); + } + auto self = shared_from_this(); + return DeferNotOk( + SubmitIO(io_context_, [self, bucket, key]() mutable -> Result { + S3Model::HeadObjectRequest req; + req.SetBucket(ToAwsString(bucket)); + req.SetKey(ToAwsString(key)); + + ARROW_ASSIGN_OR_RAISE(auto 
client_lock, self->holder_->Lock()); + auto outcome = client_lock.Move()->HeadObject(req); + if (outcome.IsSuccess()) { + const auto& result = outcome.GetResult(); + // A directory should be empty and have a trailing slash. Anything else + // we can consider a file + return result.GetContentLength() <= 0 && key[key.size() - 1] == '/'; + } + if (IsNotFound(outcome.GetError())) { + // If we can't find it then it isn't a file. + return true; + } else { + return ErrorToStatus( + std::forward_as_tuple("When getting information for key '", key, + "' in bucket '", bucket, "': "), + "HeadObject", outcome.GetError()); + } + })); + } + + // Some operations require running multiple S3 calls, either in parallel or serially. We + // need to ensure that the S3 filesystem instance stays valid and that S3 isn't + // finalized. We do this by wrapping all the tasks in a scheduler which keeps the + // resources alive + Future<> RunInScheduler( + std::function callable) { + auto self = shared_from_this(); + FnOnce initial_task = + [callable = std::move(callable), + this](util::AsyncTaskScheduler* scheduler) mutable { + return callable(scheduler, this); + }; + Future<> scheduler_fut = util::AsyncTaskScheduler::Make( + std::move(initial_task), + /*abort_callback=*/ + [](const Status& st) { + // No need for special abort logic. + }, + io_context_.stop_token()); + // Keep self alive until all tasks finish + return scheduler_fut.Then([self]() { return Status::OK(); }); + } + + Future<> DoDeleteDirContentsAsync(const std::string& bucket, const std::string& key) { + return RunInScheduler( + [bucket, key](util::AsyncTaskScheduler* scheduler, S3FileSystem::Impl* self) { + scheduler->AddSimpleTask( + [=] { + FileSelector select; + select.base_dir = bucket + kSep + key; + select.recursive = true; + select.allow_not_found = false; + + FileInfoGenerator file_infos = self->GetFileInfoGenerator(select); + + auto handle_file_infos = [=](const std::vector& file_infos) { + std::vector file_paths; + for (const auto& file_info : file_infos) { + DCHECK_GT(file_info.path().size(), bucket.size()); + file_paths.push_back(file_info.path().substr(bucket.size() + 1)); + } + scheduler->AddSimpleTask( + [=, file_paths = std::move(file_paths)] { + return self->DeleteObjectsAsync(bucket, file_paths); + }, + std::string_view("DeleteDirContentsDeleteTask")); + return Status::OK(); + }; + + return VisitAsyncGenerator( + AsyncGenerator>(std::move(file_infos)), + std::move(handle_file_infos)); + }, + std::string_view("ListFilesForDelete")); + return Status::OK(); + }); + } + Future<> DeleteDirContentsAsync(const std::string& bucket, const std::string& key) { auto self = shared_from_this(); - return WalkForDeleteDirAsync(bucket, key) - .Then([bucket, key, - self](const std::shared_ptr& discovered) -> Future<> { - if (discovered->file_keys.empty() && discovered->dir_keys.empty() && - !key.empty()) { - // No contents found, is it an empty directory? 
- ARROW_ASSIGN_OR_RAISE(bool exists, self->IsEmptyDirectory(bucket, key)); - if (!exists) { - return PathNotFound(bucket, key); - } + return EnsureIsDirAsync(bucket, key) + .Then([self, bucket, key](bool is_dir) -> Future<> { + if (!is_dir) { + return Status::IOError("Cannot delete directory contents at ", bucket, kSep, + key, " because it is a file"); } - // First delete all "files", then delete all child "directories" - return self->DeleteObjectsAsync(bucket, discovered->file_keys) - .Then([bucket, discovered, self]() { - // Delete directories in reverse lexicographic order, to ensure children - // are deleted before their parents (Minio). - std::sort(discovered->dir_keys.rbegin(), discovered->dir_keys.rend()); - return self->DeleteObjectsAsync(bucket, discovered->dir_keys); - }); + return self->DoDeleteDirContentsAsync(bucket, key); }); } + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) { + auto maybe_base_path = S3Path::FromString(select.base_dir); + if (!maybe_base_path.ok()) { + return MakeFailingGenerator(maybe_base_path.status()); + } + auto base_path = *std::move(maybe_base_path); + + PushGenerator> generator; + Future<> scheduler_fut = RunInScheduler( + [select, base_path, sink = generator.producer()]( + util::AsyncTaskScheduler* scheduler, S3FileSystem::Impl* self) { + if (base_path.empty()) { + bool should_recurse = select.recursive && select.max_recursion > 0; + self->FullListAsync(/*include_implicit_dirs=*/true, scheduler, sink, + should_recurse); + } else { + self->ListAsync(select, base_path.bucket, base_path.key, + /*include_implicit_dirs=*/true, scheduler, sink); + } + return Status::OK(); + }); + + // Mark the generator done once all tasks are finished + scheduler_fut.AddCallback([sink = generator.producer()](const Status& st) mutable { + if (!st.ok()) { + sink.Push(st); + } + sink.Close(); + }); + + return generator; + } + Status EnsureDirectoryExists(const S3Path& path) { if (!path.key.empty()) { return CreateEmptyDir(path.bucket, path.key); @@ -2381,13 +2467,13 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisListBuckets()); } - Future> ListBucketsAsync(io::IOContext ctx) { + Future> ListBucketsAsync() { auto deferred = [self = shared_from_this()]() mutable -> Result> { ARROW_ASSIGN_OR_RAISE(auto client_lock, self->holder_->Lock()); return self->ProcessListBuckets(client_lock.Move()->ListBuckets()); }; - return DeferNotOk(SubmitIO(ctx, std::move(deferred))); + return DeferNotOk(SubmitIO(io_context_, std::move(deferred))); } Result> OpenInputFile(const std::string& s, @@ -2527,73 +2613,19 @@ Result S3FileSystem::GetFileInfo(const std::string& s) { } Result S3FileSystem::GetFileInfo(const FileSelector& select) { - ARROW_ASSIGN_OR_RAISE(auto base_path, S3Path::FromString(select.base_dir)); - - FileInfoVector results; - - if (base_path.empty()) { - // List all buckets - ARROW_ASSIGN_OR_RAISE(auto buckets, impl_->ListBuckets()); - for (const auto& bucket : buckets) { - FileInfo info; - info.set_path(bucket); - info.set_type(FileType::Directory); - results.push_back(std::move(info)); - if (select.recursive) { - RETURN_NOT_OK(impl_->Walk(select, bucket, "", &results)); - } - } - return results; + Future> file_infos_fut = + CollectAsyncGenerator(GetFileInfoGenerator(select)); + ARROW_ASSIGN_OR_RAISE(std::vector file_infos, file_infos_fut.result()); + FileInfoVector combined_file_infos; + for (const auto& file_info_vec : file_infos) { + combined_file_infos.insert(combined_file_infos.end(), file_info_vec.begin(), + file_info_vec.end()); } - - 
// Nominal case -> walk a single bucket - RETURN_NOT_OK(impl_->Walk(select, base_path.bucket, base_path.key, &results)); - return results; + return combined_file_infos; } FileInfoGenerator S3FileSystem::GetFileInfoGenerator(const FileSelector& select) { - auto maybe_base_path = S3Path::FromString(select.base_dir); - if (!maybe_base_path.ok()) { - return MakeFailingGenerator(maybe_base_path.status()); - } - auto base_path = *std::move(maybe_base_path); - - if (base_path.empty()) { - // List all buckets, then possibly recurse - PushGenerator> gen; - auto producer = gen.producer(); - - auto fut = impl_->ListBucketsAsync(io_context()); - auto impl = impl_->shared_from_this(); - fut.AddCallback( - [producer, select, impl](const Result>& res) mutable { - if (!res.ok()) { - producer.Push(res.status()); - producer.Close(); - return; - } - FileInfoVector buckets; - for (const auto& bucket : *res) { - buckets.push_back(FileInfo{bucket, FileType::Directory}); - } - // Generate all bucket infos - auto buckets_fut = Future::MakeFinished(std::move(buckets)); - producer.Push(MakeSingleFutureGenerator(buckets_fut)); - if (select.recursive) { - // Generate recursive walk for each bucket in turn - for (const auto& bucket : *buckets_fut.result()) { - producer.Push(impl->WalkAsync(select, bucket.path(), "")); - } - } - producer.Close(); - }); - - return MakeConcatenatedGenerator( - AsyncGenerator>{std::move(gen)}); - } - - // Nominal case -> walk a single bucket - return impl_->WalkAsync(select, base_path.bucket, base_path.key); + return impl_->GetFileInfoGenerator(select); } Status S3FileSystem::CreateDir(const std::string& s, bool recursive) { diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 718304abaed63..e9f14fde72316 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -66,13 +66,17 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" +#include "arrow/util/string.h" namespace arrow { namespace fs { using ::arrow::internal::checked_pointer_cast; using ::arrow::internal::PlatformFilename; +using ::arrow::internal::ToChars; using ::arrow::internal::UriEscape; +using ::arrow::internal::Zip; using ::arrow::fs::internal::ConnectRetryStrategy; using ::arrow::fs::internal::ErrorToStatus; @@ -459,16 +463,19 @@ class TestS3FS : public S3TestMixin { } } - void MakeFileSystem() { + Result> MakeNewFileSystem( + io::IOContext io_context = io::default_io_context()) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); options_.scheme = "http"; options_.endpoint_override = minio_->connect_string(); if (!options_.retry_strategy) { options_.retry_strategy = std::make_shared(); } - ASSERT_OK_AND_ASSIGN(fs_, S3FileSystem::Make(options_)); + return S3FileSystem::Make(options_, io_context); } + void MakeFileSystem() { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem()); } + template void AssertMetadataRoundtrip(const std::string& path, const std::shared_ptr& metadata, @@ -787,6 +794,81 @@ TEST_F(TestS3FS, GetFileInfoGenerator) { // Non-root dir case is tested by generic tests } +TEST_F(TestS3FS, GetFileInfoGeneratorStress) { + // This test is slow because it needs to create a bunch of seed files. However, it is + // the only test that stresses listing and deleting when there are more than 1000 files + // and paging is required. 
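+ // (For scale: 4 dirs x 512 files is over 2000 objects, so a full listing
+ // requires at least three ListObjectsV2 pages at kListObjectsMaxKeys = 1000.)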
+  constexpr int32_t kNumDirs = 4;
+  constexpr int32_t kNumFilesPerDir = 512;
+  FileInfoVector expected_infos;
+
+  ASSERT_OK(fs_->CreateDir("stress"));
+  for (int32_t i = 0; i < kNumDirs; i++) {
+    const std::string dir_path = "stress/" + ToChars(i);
+    ASSERT_OK(fs_->CreateDir(dir_path));
+    expected_infos.emplace_back(dir_path, FileType::Directory);
+
+    std::vector<Future<>> tasks;
+    for (int32_t j = 0; j < kNumFilesPerDir; j++) {
+      // Create the files in parallel in hopes of speeding up this process as much as
+      // possible
+      const std::string file_name = ToChars(j);
+      const std::string file_path = dir_path + "/" + file_name;
+      expected_infos.emplace_back(file_path, FileType::File);
+      ASSERT_OK_AND_ASSIGN(Future<> task,
+                           ::arrow::internal::GetCpuThreadPool()->Submit(
+                               [fs = fs_, file_name, file_path]() -> Status {
+                                 ARROW_ASSIGN_OR_RAISE(
+                                     std::shared_ptr<io::OutputStream> out_str,
+                                     fs->OpenOutputStream(file_path));
+                                 ARROW_RETURN_NOT_OK(out_str->Write(file_name));
+                                 return out_str->Close();
+                               }));
+      tasks.push_back(std::move(task));
+    }
+    ASSERT_FINISHES_OK(AllFinished(tasks));
+  }
+  SortInfos(&expected_infos);
+
+  FileSelector select;
+  FileInfoVector infos;
+  select.base_dir = "stress";
+  select.recursive = true;
+
+  // 32 iterations is pretty fast; listing is much faster than the create step above
+  constexpr int32_t kNumTasks = 32;
+  for (int i = 0; i < kNumTasks; i++) {
+    CollectFileInfoGenerator(fs_->GetFileInfoGenerator(select), &infos);
+    SortInfos(&infos);
+    // One info for each directory and one info for each file
+    ASSERT_EQ(infos.size(), expected_infos.size());
+    for (const auto&& [info, expected] : Zip(infos, expected_infos)) {
+      AssertFileInfo(info, expected.path(), expected.type());
+    }
+  }
+
+  ASSERT_OK(fs_->DeleteDirContents("stress"));
+
+  CollectFileInfoGenerator(fs_->GetFileInfoGenerator(select), &infos);
+  ASSERT_EQ(infos.size(), 0);
+}
+
+TEST_F(TestS3FS, GetFileInfoGeneratorCancelled) {
+  FileSelector select;
+  FileInfoVector infos;
+  select.base_dir = "bucket";
+  select.recursive = true;
+
+  StopSource stop_source;
+  io::IOContext cancellable_context(stop_source.token());
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<S3FileSystem> cancellable_fs,
+                       MakeNewFileSystem(cancellable_context));
+  stop_source.RequestStop();
+  FileInfoGenerator generator = cancellable_fs->GetFileInfoGenerator(select);
+  auto file_infos = CollectAsyncGenerator(std::move(generator));
+  ASSERT_FINISHES_AND_RAISES(Cancelled, file_infos);
+}
+
 TEST_F(TestS3FS, CreateDir) {
   FileInfo st;
 
diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc
index 55627eb43bbcf..63e27bfbe5773 100644
--- a/cpp/src/arrow/util/async_util.cc
+++ b/cpp/src/arrow/util/async_util.cc
@@ -201,10 +201,14 @@ class AsyncTaskSchedulerImpl : public AsyncTaskScheduler {
     }
     // Capture `task` to keep it alive until finished
    if (!submit_result->TryAddCallback([this, task_inner = std::move(task)]() mutable {
-          return [this, task_inner2 = std::move(task_inner)](const Status& st) {
+          return [this, task_inner2 = std::move(task_inner)](const Status& st) mutable {
 #ifdef ARROW_WITH_OPENTELEMETRY
             TraceTaskFinished(task_inner2.get());
 #endif
+            // OnTaskFinished might trigger the scheduler to end. We want to ensure that
+            // is the very last thing that happens, after all task destructors have run,
+            // so we eagerly destroy the task first.
+            task_inner2.reset();
             OnTaskFinished(st);
           };
         })) {
diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc
index 7734b84c9ebaf..313ca91912335 100644
--- a/cpp/src/arrow/util/async_util_test.cc
+++ b/cpp/src/arrow/util/async_util_test.cc
@@ -204,6 +204,29 @@ TEST(AsyncTaskScheduler, InitialTaskFails) {
   ASSERT_FINISHES_AND_RAISES(Invalid, finished);
 }
 
+TEST(AsyncTaskScheduler, TaskDestroyedBeforeSchedulerEnds) {
+  bool my_task_destroyed = false;
+  Future<> task_fut = Future<>::Make();
+  struct DestroyTrackingTask : public AsyncTaskScheduler::Task {
+    DestroyTrackingTask(bool& my_task_destroyed, Future<> task_fut)
+        : my_task_destroyed(my_task_destroyed), task_fut(std::move(task_fut)) {}
+    ~DestroyTrackingTask() override { my_task_destroyed = true; }
+    std::string_view name() const override { return "DestroyTrackingTask"; }
+    Result<Future<>> operator()() override { return task_fut; }
+    bool& my_task_destroyed;
+    Future<> task_fut;
+  };
+  Future<> finished = AsyncTaskScheduler::Make([&](AsyncTaskScheduler* scheduler) {
+    scheduler->AddTask(std::make_unique<DestroyTrackingTask>(
+        my_task_destroyed, task_fut));
+    return Status::OK();
+  }).Then([&] { ASSERT_TRUE(my_task_destroyed); });
+  ASSERT_FALSE(my_task_destroyed);
+  task_fut.MarkFinished();
+  ASSERT_FINISHES_OK(finished);
+  ASSERT_TRUE(my_task_destroyed);
+}
+
 TEST(AsyncTaskScheduler, TaskGroup) {
   Future<> task = Future<>::Make();
   bool finish_callback_ran = false;

From ec2bc346b85b0ffd49dff49bf29552e45e02ca98 Mon Sep 17 00:00:00 2001
From: Thor <8681572+thorfour@users.noreply.github.com>
Date: Tue, 25 Jul 2023 11:42:26 -0500
Subject: [PATCH 045/749] GH-36850: [Go] Arrow Concatenate fix, ensure
 allocations are Free'd (#36854)

### Rationale for this change
The Concatenate function would capture panics and return errors; however, it wouldn't ensure that any allocations that happened in the `concat` sub-function were cleaned up upon doing so. This change moves the `recover()` step into the `concat` function and calls `Release()` on the data object in the case of a panic or error.

### What changes are included in this PR?

### Are these changes tested?
A test called `TestConcatPanic` was added that causes the allocator to panic partway through a concatenation, and ensures that the checked allocator still reports 0 allocated bytes.

### Are there any user-facing changes?

* Closes: #36850

Authored-by: thorfour
Signed-off-by: Matt Topol
---
 go/arrow/array/concat.go      | 28 ++++++++++++------------
 go/arrow/array/concat_test.go | 40 +++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go
index 838fe91d0accc..5d3d2e005d488 100644
--- a/go/arrow/array/concat.go
+++ b/go/arrow/array/concat.go
@@ -41,17 +41,6 @@ func Concatenate(arrs []arrow.Array, mem memory.Allocator) (result arrow.Array,
 		return nil, errors.New("array/concat: must pass at least one array")
 	}
 
-	defer func() {
-		if pErr := recover(); pErr != nil {
-			switch e := pErr.(type) {
-			case error:
-				err = fmt.Errorf("arrow/concat: %w", e)
-			default:
-				err = fmt.Errorf("arrow/concat: %v", pErr)
-			}
-		}
-	}()
-
 	// gather Data of inputs
 	data := make([]arrow.ArrayData, len(arrs))
 	for i, ar := range arrs {
@@ -368,8 +357,21 @@ func concatOffsets(buffers []*memory.Buffer, byteWidth int, mem memory.Allocator
 // concat is the implementation for actually performing the concatenation of the arrow.ArrayData
 // objects that we can call internally for nested types.
-func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, error) { +func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, err error) { out := &Data{refCount: 1, dtype: data[0].DataType(), nulls: 0} + defer func() { + if pErr := recover(); pErr != nil { + switch e := pErr.(type) { + case error: + err = fmt.Errorf("arrow/concat: %w", e) + default: + err = fmt.Errorf("arrow/concat: %v", pErr) + } + } + if err != nil { + out.Release() + } + }() for _, d := range data { out.length += d.Len() if out.nulls == UnknownNullCount || d.NullN() == UnknownNullCount { @@ -445,8 +447,8 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, erro if err != nil { return nil, err } - out.buffers[2] = concatBuffers(gatherBufferRanges(data, 2, valueRanges), mem) out.buffers[1] = offsetBuffer + out.buffers[2] = concatBuffers(gatherBufferRanges(data, 2, valueRanges), mem) case *arrow.ListType: offsetWidth := dt.Layout().Buffers[1].ByteWidth offsetBuffer, valueRanges, err := concatOffsets(gatherFixedBuffers(data, 1, offsetWidth), offsetWidth, mem) diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index 6cf86883d1520..3c1cb4c3d0812 100644 --- a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -743,3 +743,43 @@ func TestConcatOverflowRunEndEncoding(t *testing.T) { }) } } + +type panicAllocator struct { + n int + memory.Allocator +} + +func (p *panicAllocator) Allocate(size int) []byte { + if size > p.n { + panic("panic allocator") + } + return p.Allocator.Allocate(size) +} + +func (p *panicAllocator) Reallocate(size int, b []byte) []byte { + return p.Allocator.Reallocate(size, b) +} + +func (p *panicAllocator) Free(b []byte) { + p.Allocator.Free(b) +} + +func TestConcatPanic(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + allocator := &panicAllocator{ + n: 400, + Allocator: mem, + } + + g := gen.NewRandomArrayGenerator(0, memory.DefaultAllocator) + ar1 := g.ArrayOf(arrow.STRING, 32, 0) + defer ar1.Release() + ar2 := g.ArrayOf(arrow.STRING, 32, 0) + defer ar2.Release() + + concat, err := array.Concatenate([]arrow.Array{ar1, ar2}, allocator) + assert.Error(t, err) + assert.Nil(t, concat) +} From d9b90035ddf273fa547d34ad2f4b04c15af3704a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 25 Jul 2023 18:49:26 +0200 Subject: [PATCH 046/749] GH-36863: [C#][Packaging] Do not shutdown PythonEngine on CDataInterfacePythonTests if .NET is > 5.0 (#36868) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Tests are failing on maintenance branch to generate Nuget packages. This has been tested on the maintenance branch and it solves the issue. ### What changes are included in this PR? Only Shutdown if `#if !NET5_0_OR_GREATER` ### Are these changes tested? Locally and via archery. ### Are there any user-facing changes? 
No

* Closes: #36863

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
index 4c53b98e3d9f1..86d7ff52cdf09 100644
--- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
@@ -60,7 +60,9 @@ public PythonNet()
 
         public void Dispose()
         {
+#if !NET5_0_OR_GREATER
             PythonEngine.Shutdown();
+#endif
         }
     }
 
From adfd482518cf2c7ff620cc1986ad38e51b378e1f Mon Sep 17 00:00:00 2001
From: Thor <8681572+thorfour@users.noreply.github.com>
Date: Tue, 25 Jul 2023 14:09:49 -0500
Subject: [PATCH 047/749] GH-36671: [Go] BinaryMemoTable optimize allocations
 of GetOrInsert (#36811)

### Rationale for this change
The hashing.MemoTable provides an interface with

GetOrInsert(val interface{}) (idx int, existed bool, err error)

This can cause a costly allocation for binary dictionaries, as is detailed in issue https://github.com/apache/arrow/issues/36671

If we expand the MemoTable interface to include:

GetOrInsertBytes(val []byte) (idx int, existed bool, err error)

we can avoid the runtime allocations needed to convert from `[]byte` to `interface{}`.

### What changes are included in this PR?
No logic was changed in the BinaryMemoTable; instead, the same API as `GetOrInsert` was copied to a type-specific `GetOrInsertBytes`.

The `BinaryDictionaryBuilder` now simply calls these byte-slice methods instead of the generic `interface{}` ones.

### Are these changes tested?
No additional tests were included, as the current tests exercise this code.

### Are there any user-facing changes?
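For illustration of the rationale above — a minimal, hypothetical sketch (not part of this patch; the function and benchmark names are invented) of the allocation difference between an `interface{}` parameter and a typed `[]byte` parameter. `//go:noinline` stands in for a method call that escape analysis cannot see through, as with the real memo table:

```go
package hashing_test

import "testing"

//go:noinline
func getOrInsertIface(val interface{}) int { return len(val.([]byte)) }

//go:noinline
func getOrInsertBytes(val []byte) int { return len(val) }

func BenchmarkIfaceArg(b *testing.B) {
	v := []byte("dictionary-value")
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// The []byte -> interface{} conversion heap-allocates a copy of the
		// slice header on every call, since the callee could retain it.
		_ = getOrInsertIface(v)
	}
}

func BenchmarkBytesArg(b *testing.B) {
	v := []byte("dictionary-value")
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// Passing the slice directly copies the header by value: 0 allocs/op.
		_ = getOrInsertBytes(v)
	}
}
```

Running `go test -bench=. -benchmem` on a sketch like this should report one allocation per op for the `interface{}` variant and zero for the `[]byte` variant, which is exactly the overhead the new method removes from the dictionary-building hot path.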
* Closes: #36671 Authored-by: thorfour Signed-off-by: Matt Topol --- go/arrow/array/dictionary.go | 20 ++++++-- go/arrow/array/dictionary_test.go | 19 +++++++ go/internal/hashing/xxh3_memo_table.gen.go | 50 +++++++++++++++++++ .../hashing/xxh3_memo_table.gen.go.tmpl | 6 +++ go/internal/hashing/xxh3_memo_table.go | 22 ++++++++ 5 files changed, 114 insertions(+), 3 deletions(-) diff --git a/go/arrow/array/dictionary.go b/go/arrow/array/dictionary.go index da1aea50b24f8..ccb6f32321496 100644 --- a/go/arrow/array/dictionary.go +++ b/go/arrow/array/dictionary.go @@ -842,6 +842,11 @@ func (b *dictionaryBuilder) insertDictValue(val interface{}) error { return err } +func (b *dictionaryBuilder) insertDictBytes(val []byte) error { + _, _, err := b.memoTable.GetOrInsertBytes(val) + return err +} + func (b *dictionaryBuilder) appendValue(val interface{}) error { idx, _, err := b.memoTable.GetOrInsert(val) b.idxBuilder.Append(idx) @@ -849,6 +854,13 @@ func (b *dictionaryBuilder) appendValue(val interface{}) error { return err } +func (b *dictionaryBuilder) appendBytes(val []byte) error { + idx, _, err := b.memoTable.GetOrInsertBytes(val) + b.idxBuilder.Append(idx) + b.length += 1 + return err +} + func getvalFn(arr arrow.Array) func(i int) interface{} { switch typedarr := arr.(type) { case *Int8: @@ -1285,16 +1297,18 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error { b.AppendNull() return nil } - return b.appendValue(v) + + return b.appendBytes(v) } -func (b *BinaryDictionaryBuilder) AppendString(v string) error { return b.appendValue(v) } + +func (b *BinaryDictionaryBuilder) AppendString(v string) error { return b.appendBytes([]byte(v)) } func (b *BinaryDictionaryBuilder) InsertDictValues(arr *Binary) (err error) { if !arrow.TypeEqual(arr.DataType(), b.dt.ValueType) { return fmt.Errorf("dictionary insert type mismatch: cannot insert values of type %T to dictionary type %T", arr.DataType(), b.dt.ValueType) } for i := 0; i < arr.Len(); i++ { - if err = b.insertDictValue(arr.Value(i)); err != nil { + if err = b.insertDictBytes(arr.Value(i)); err != nil { break } } diff --git a/go/arrow/array/dictionary_test.go b/go/arrow/array/dictionary_test.go index 8bb9edebf89bc..99c8e6ffcd47b 100644 --- a/go/arrow/array/dictionary_test.go +++ b/go/arrow/array/dictionary_test.go @@ -19,6 +19,7 @@ package array_test import ( "fmt" "math" + "math/rand" "reflect" "strings" "testing" @@ -1846,3 +1847,21 @@ func TestBinaryDictionaryPanic(t *testing.T) { }() assert.True(t, allocator.paniced) } + +func BenchmarkBinaryDictionaryBuilder(b *testing.B) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(b, 0) + + dictType := &arrow.DictionaryType{IndexType: &arrow.Int32Type{}, ValueType: arrow.BinaryTypes.String} + bldr := array.NewDictionaryBuilder(mem, dictType) + defer bldr.Release() + + randString := func() string { + return fmt.Sprintf("test-%d", rand.Intn(30)) + } + + builder := bldr.(*array.BinaryDictionaryBuilder) + for i := 0; i < b.N; i++ { + assert.NoError(b, builder.AppendString(randString())) + } +} diff --git a/go/internal/hashing/xxh3_memo_table.gen.go b/go/internal/hashing/xxh3_memo_table.gen.go index 0c36aee950f83..f561c5f30f895 100644 --- a/go/internal/hashing/xxh3_memo_table.gen.go +++ b/go/internal/hashing/xxh3_memo_table.gen.go @@ -298,6 +298,11 @@ func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err e return } +// GetOrInsertBytes is unimplemented +func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + 
panic("unimplemented") +} + type payloadUint8 struct { val uint8 memoIdx int32 @@ -570,6 +575,11 @@ func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadInt16 struct { val int16 memoIdx int32 @@ -842,6 +852,11 @@ func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint16 struct { val uint16 memoIdx int32 @@ -1114,6 +1129,11 @@ func (s *Uint16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadInt32 struct { val int32 memoIdx int32 @@ -1386,6 +1406,11 @@ func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadInt64 struct { val int64 memoIdx int32 @@ -1658,6 +1683,11 @@ func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint32 struct { val uint32 memoIdx int32 @@ -1930,6 +1960,11 @@ func (s *Uint32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadUint64 struct { val uint64 memoIdx int32 @@ -2202,6 +2237,11 @@ func (s *Uint64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err return } +// GetOrInsertBytes is unimplemented +func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadFloat32 struct { val float32 memoIdx int32 @@ -2493,6 +2533,11 @@ func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, er return } +// GetOrInsertBytes is unimplemented +func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + type payloadFloat64 struct { val float64 memoIdx int32 @@ -2781,3 +2826,8 @@ func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, er } return } + +// GetOrInsertBytes is unimplemented +func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} diff --git a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl index 94c893b94b314..10127c43cc6b1 100644 --- a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl +++ b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl @@ -340,4 +340,10 @@ func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, } return } + + +// GetOrInsertBytes is unimplemented +func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} {{end}} diff --git a/go/internal/hashing/xxh3_memo_table.go 
b/go/internal/hashing/xxh3_memo_table.go
index 67e2aef380488..81994f0a88541 100644
--- a/go/internal/hashing/xxh3_memo_table.go
+++ b/go/internal/hashing/xxh3_memo_table.go
@@ -53,6 +53,12 @@ type MemoTable interface {
 	// the table (if false, the value was inserted). An error is returned
 	// if val is not the appropriate type for the table.
 	GetOrInsert(val interface{}) (idx int, existed bool, err error)
+	// GetOrInsertBytes returns the index in the table of the specified value,
+	// and a boolean indicating whether or not the value was found in
+	// the table (if false, the value was inserted). An error is returned
+	// if val is not the appropriate type for the table. This function is intended to be used by
+	// the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}.
+	GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
 	// GetOrInsertNull returns the index of the null value in the table,
 	// inserting one if it hasn't already been inserted. It returns a boolean
 	// indicating if the null value already existed or not in the table.
@@ -231,6 +237,22 @@ func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
 	return KeyNotFound, false
 }
 
+// GetOrInsertBytes returns the index of the given value in the table, if not found
+// it is inserted into the table. The return value 'found' indicates whether the value
+// was found in the table (true) or inserted (false) along with any possible error.
+func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	h := Hash(val, 0)
+	p, found := b.lookup(h, val)
+	if found {
+		idx = int(p.payload.val)
+	} else {
+		idx = b.Size()
+		b.builder.Append(val)
+		b.tbl.Insert(p, h, int32(idx), -1)
+	}
+	return
+}
+
 // GetOrInsert returns the index of the given value in the table, if not found
 // it is inserted into the table. The return value 'found' indicates whether the value
 // was found in the table (true) or inserted (false) along with any possible error.

From 5a9240fd28e1d7261a10a4816301d22995cfb8e0 Mon Sep 17 00:00:00 2001
From: Matt Topol
Date: Tue, 25 Jul 2023 16:14:51 -0400
Subject: [PATCH 048/749] MINOR: Fix build double declaration (#36876)

### Rationale for this change
#36811 and #36854 both introduced a helper in their tests, but the result after merge was a collision causing it to be declared twice, which made the Go build fail. By removing one of the duplicate declarations, the build is fixed.
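For reference, the failure mode is Go's redeclaration rule. A minimal sketch — intentionally non-compiling, with the two colliding test files collapsed into one; `panicAllocator` is the helper that was duplicated:

```go
package array_test

// Declaration merged in from #36854.
type panicAllocator struct{ n int }

// Identical declaration merged in from #36811; the compiler rejects the
// package with: "panicAllocator redeclared in this block".
type panicAllocator struct{ n int }
```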
Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/array/concat_test.go | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index 3c1cb4c3d0812..a74166541e856 100644 --- a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -744,26 +744,6 @@ func TestConcatOverflowRunEndEncoding(t *testing.T) { } } -type panicAllocator struct { - n int - memory.Allocator -} - -func (p *panicAllocator) Allocate(size int) []byte { - if size > p.n { - panic("panic allocator") - } - return p.Allocator.Allocate(size) -} - -func (p *panicAllocator) Reallocate(size int, b []byte) []byte { - return p.Allocator.Reallocate(size, b) -} - -func (p *panicAllocator) Free(b []byte) { - p.Allocator.Free(b) -} - func TestConcatPanic(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) From 2f039f1e0d53b793c634078379ac031c6c55d53d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 26 Jul 2023 10:05:58 +0200 Subject: [PATCH 049/749] GH-36863: [C#] Remove unnecessary applied fix to not shutdown PythonEngine on CDataInterfacePythonTests if .NET is > 5.0 (#36872) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Maintenance branch fix is not necessary. ### What changes are included in this PR? Remove unnecessary patch ### Are these changes tested? Yes, on CI ### Are there any user-facing changes? No * Closes: #36863 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 86d7ff52cdf09..4c53b98e3d9f1 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -60,9 +60,7 @@ public PythonNet() public void Dispose() { -#if !NET5_0_OR_GREATER PythonEngine.Shutdown(); -#endif } } From 52ac7185e32f395927d5b7f2aee102ceb8485a98 Mon Sep 17 00:00:00 2001 From: 0x26res Date: Wed, 26 Jul 2023 14:27:16 +0100 Subject: [PATCH 050/749] GH-36809: [Python] MapScalar.as_py with custom field name (#36830) ### Rationale for this change `MapScalar.as_py` doesn't take into account custom key/value field names ### What changes are included in this PR? Fix and tests ### Are these changes tested? Simple unit test ### Are there any user-facing changes? No API changes. 
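As a minimal repro, mirroring the new unit test below:

```python
import pyarrow as pa

# A map type whose key/value fields use custom names instead of the
# defaults "key" and "value".
map_type = pa.map_(
    pa.field("custom_key", pa.string(), nullable=False),
    pa.field("custom_value", pa.string()),
)
scalar = pa.scalar([("foo", "bar")], map_type)

# Previously as_py() failed here because it looked up the hard-coded
# field names "key"/"value"; with the fix it honors the declared names.
assert scalar.as_py() == [("foo", "bar")]
```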
* Closes: #36809 Authored-by: aandres Signed-off-by: Sutou Kouhei --- python/pyarrow/scalar.pxi | 4 ++-- python/pyarrow/tests/test_scalars.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index f438c8847bb02..74f5aa4213ca0 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -785,7 +785,7 @@ cdef class MapScalar(ListScalar): if arr is None: raise IndexError(i) dct = arr[_normalize_index(i, len(arr))] - return (dct['key'], dct['value']) + return (dct[self.type.key_field.name], dct[self.type.item_field.name]) def __iter__(self): """ @@ -794,7 +794,7 @@ cdef class MapScalar(ListScalar): arr = self.values if array is None: raise StopIteration - for k, v in zip(arr.field('key'), arr.field('value')): + for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) def as_py(self): diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index b7180e5250fdf..2aaefe16ae469 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -791,3 +791,26 @@ def test_union(): assert arr[0].as_py() == b'a' assert arr[5].type_code == 1 assert arr[5].as_py() == 3 + + +def test_map_scalar_as_py_with_custom_field_name(): + """ + Check we can call `MapScalar.as_py` with custom field names + + See https://github.com/apache/arrow/issues/36809 + """ + assert pa.scalar( + [("foo", "bar")], + pa.map_( + pa.string(), + pa.string() + ), + ).as_py() == [("foo", "bar")] + + assert pa.scalar( + [("foo", "bar")], + pa.map_( + pa.field("custom_key", pa.string(), nullable=False), + pa.field("custom_value", pa.string()), + ), + ).as_py() == [("foo", "bar")] From 8503c869fa80090bc849bae0b4e68a84605ea82e Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Thu, 27 Jul 2023 00:45:29 +1100 Subject: [PATCH 051/749] GH-35658: [Packaging] Sync conda recipes with feedstocks (#35637) Corresponds to https://github.com/conda-forge/arrow-cpp-feedstock/pull/1053 for pyarrow (one failing test on our infra that needs debugging), as well as the state of the feedstock for r-arrow after https://github.com/conda-forge/r-arrow-feedstock/pull/65 * Closes: #35658 Authored-by: H. Vetinari Signed-off-by: Sutou Kouhei --- ...> linux_64_cuda_compiler_version11.2.yaml} | 33 +++-- .../linux_64_cuda_compiler_versionNone.yaml | 29 ++-- ...nux_aarch64_cuda_compiler_version11.2.yaml | 91 ++++++++++++ ...nux_aarch64_cuda_compiler_versionNone.yaml | 29 ++-- ...nux_ppc64le_cuda_compiler_version11.2.yaml | 87 ++++++++++++ ...nux_ppc64le_cuda_compiler_versionNone.yaml | 29 ++-- .../conda-recipes/.ci_support/osx_64_.yaml | 24 ++-- .../conda-recipes/.ci_support/osx_arm64_.yaml | 24 ++-- .../.ci_support/r/linux_64_r_base4.1.yaml | 27 ++++ .../.ci_support/r/linux_64_r_base4.2.yaml | 4 +- .../r/linux_aarch64_r_base4.1.yaml | 31 ++++ .../r/linux_aarch64_r_base4.2.yaml | 4 +- .../.ci_support/r/osx_64_r_base4.1.yaml | 27 ++++ .../.ci_support/r/osx_64_r_base4.2.yaml | 4 +- .../.ci_support/r/osx_arm64_r_base4.1.yaml | 27 ++++ .../.ci_support/r/osx_arm64_r_base4.2.yaml | 4 +- ... 
=> win_64_cuda_compiler_version11.2.yaml} | 26 ++-- .../win_64_cuda_compiler_versionNone.yaml | 26 ++-- dev/tasks/conda-recipes/arrow-cpp/activate.sh | 74 ++++++++-- .../{bld-arrow.bat => build-arrow.bat} | 14 +- .../conda-recipes/arrow-cpp/build-arrow.sh | 43 +++--- .../{bld-pyarrow.bat => build-pyarrow.bat} | 14 +- .../conda-recipes/arrow-cpp/build-pyarrow.sh | 5 +- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 134 +++++++++++------- dev/tasks/conda-recipes/azure.linux.yml | 23 +++ dev/tasks/conda-recipes/r-arrow/configure.win | 4 +- dev/tasks/conda-recipes/r-arrow/meta.yaml | 2 - dev/tasks/tasks.yml | 68 ++++++++- 28 files changed, 687 insertions(+), 220 deletions(-) rename dev/tasks/conda-recipes/.ci_support/{linux_64_cuda_compiler_version10.2.yaml => linux_64_cuda_compiler_version11.2.yaml} (79%) create mode 100644 dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml create mode 100644 dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml create mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml create mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml create mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml create mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml rename dev/tasks/conda-recipes/.ci_support/{win_64_cuda_compiler_version10.2.yaml => win_64_cuda_compiler_version11.2.yaml} (78%) rename dev/tasks/conda-recipes/arrow-cpp/{bld-arrow.bat => build-arrow.bat} (89%) rename dev/tasks/conda-recipes/arrow-cpp/{bld-pyarrow.bat => build-pyarrow.bat} (54%) diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml similarity index 79% rename from dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2.yaml rename to dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml index 5d80a17c4dfd7..1cdcec199e7ba 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version11.2.yaml @@ -1,17 +1,15 @@ aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc c_compiler_version: -- '7' +- '10' cdt_name: -- cos6 +- cos7 channel_sources: - conda-forge channel_targets: @@ -19,38 +17,40 @@ channel_targets: cuda_compiler: - nvcc cuda_compiler_version: -- '10.2' +- '11.2' cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '7' +- '10' docker_image: -- quay.io/condaforge/linux-anvil-cos7-cuda:10.2 +- quay.io/condaforge/linux-anvil-cuda:11.2 gflags: - '2.2' glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -61,7 +61,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -73,9 +73,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml 
b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml index 39b25b44690d7..5be5b58a73932 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNone.yaml @@ -1,15 +1,13 @@ aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_name: - cos6 channel_sources: @@ -17,15 +15,15 @@ channel_sources: channel_targets: - conda-forge main cuda_compiler: -- nvcc +- None cuda_compiler_version: - None cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 gflags: @@ -33,24 +31,26 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -61,7 +61,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -73,9 +73,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml new file mode 100644 index 0000000000000..1677b03564c08 --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_version11.2.yaml @@ -0,0 +1,91 @@ +BUILD: +- aarch64-conda_cos7-linux-gnu +aws_crt_cpp: +- 0.20.3 +aws_sdk_cpp: +- 1.10.57 +bzip2: +- '1' +c_compiler: +- gcc +c_compiler_version: +- '10' +cdt_arch: +- aarch64 +cdt_name: +- cos7 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cuda_compiler: +- nvcc +cuda_compiler_version: +- '11.2' +cuda_compiler_version_min: +- '11.2' +cxx_compiler: +- gxx +cxx_compiler_version: +- '10' +docker_image: +- quay.io/condaforge/linux-anvil-cuda:11.2 +gflags: +- '2.2' +glog: +- '0.6' +google_cloud_cpp: +- '2.12' +libabseil: +- '20230125' +libgrpc: +- '1.54' +- '1.56' +libprotobuf: +- '3.21' +- 4.23.3 +lz4_c: +- 1.9.3 +numpy: +- '1.21' +- '1.23' +- '1.21' +- '1.21' +openssl: +- '3' +orc: +- 1.9.0 +pin_run_as_build: + python: + min_pin: x.x + max_pin: x.x +python: +- 3.10.* *_cpython +- 3.11.* *_cpython +- 3.8.* *_cpython +- 3.9.* *_cpython +re2: +- 2023.03.02 +snappy: +- '1' +target_platform: +- linux-aarch64 +thrift_cpp: +- 0.18.1 +ucx: +- 1.14.0 +zip_keys: +- - c_compiler_version + - cxx_compiler_version + - cuda_compiler + - cuda_compiler_version + - cdt_name + - docker_image +- - libgrpc + - libprotobuf +- - python + - numpy +zlib: +- '1.2' +zstd: +- '1.5' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml index af0fc2dcd255e..88fdf1254e661 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_cuda_compiler_versionNone.yaml @@ -1,17 +1,15 @@ BUILD: - aarch64-conda_cos7-linux-gnu aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc 
c_compiler_version: -- '11' +- '12' cdt_arch: - aarch64 cdt_name: @@ -20,12 +18,16 @@ channel_sources: - conda-forge channel_targets: - conda-forge main +cuda_compiler: +- None cuda_compiler_version: - None +cuda_compiler_version_min: +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 gflags: @@ -33,24 +35,26 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -61,7 +65,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -73,9 +77,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml new file mode 100644 index 0000000000000..3585db7b99baa --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_version11.2.yaml @@ -0,0 +1,87 @@ +aws_crt_cpp: +- 0.20.3 +aws_sdk_cpp: +- 1.10.57 +bzip2: +- '1' +c_compiler: +- gcc +c_compiler_version: +- '10' +cdt_name: +- cos7 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cuda_compiler: +- nvcc +cuda_compiler_version: +- '11.2' +cuda_compiler_version_min: +- '11.2' +cxx_compiler: +- gxx +cxx_compiler_version: +- '10' +docker_image: +- quay.io/condaforge/linux-anvil-cuda:11.2 +gflags: +- '2.2' +glog: +- '0.6' +google_cloud_cpp: +- '2.12' +libabseil: +- '20230125' +libgrpc: +- '1.54' +- '1.56' +libprotobuf: +- '3.21' +- 4.23.3 +lz4_c: +- 1.9.3 +numpy: +- '1.21' +- '1.23' +- '1.21' +- '1.21' +openssl: +- '3' +orc: +- 1.9.0 +pin_run_as_build: + python: + min_pin: x.x + max_pin: x.x +python: +- 3.10.* *_cpython +- 3.11.* *_cpython +- 3.8.* *_cpython +- 3.9.* *_cpython +re2: +- 2023.03.02 +snappy: +- '1' +target_platform: +- linux-ppc64le +thrift_cpp: +- 0.18.1 +ucx: +- 1.14.0 +zip_keys: +- - c_compiler_version + - cxx_compiler_version + - cuda_compiler + - cuda_compiler_version + - cdt_name + - docker_image +- - libgrpc + - libprotobuf +- - python + - numpy +zlib: +- '1.2' +zstd: +- '1.5' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml index 83a1f7f740092..c13a522254286 100644 --- a/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/linux_ppc64le_cuda_compiler_versionNone.yaml @@ -1,27 +1,29 @@ aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_name: - cos7 channel_sources: - conda-forge channel_targets: - conda-forge main +cuda_compiler: +- None cuda_compiler_version: - None +cuda_compiler_version_min: +- '11.2' cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 gflags: @@ -29,24 +31,26 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 
numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -57,7 +61,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -69,9 +73,12 @@ ucx: zip_keys: - - c_compiler_version - cxx_compiler_version + - cuda_compiler - cuda_compiler_version - cdt_name - docker_image +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml index 0cf990cc113f2..dd4a230760ef2 100644 --- a/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml +++ b/dev/tasks/conda-recipes/.ci_support/osx_64_.yaml @@ -1,17 +1,15 @@ MACOSX_DEPLOYMENT_TARGET: - '10.9' aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -21,19 +19,21 @@ cuda_compiler_version: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' gflags: - '2.2' glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 macos_machine: @@ -41,12 +41,12 @@ macos_machine: numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -57,7 +57,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -67,6 +67,8 @@ thrift_cpp: zip_keys: - - c_compiler_version - cxx_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml b/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml index 3faa6278e81e2..6a6713a54fe86 100644 --- a/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml +++ b/dev/tasks/conda-recipes/.ci_support/osx_arm64_.yaml @@ -1,17 +1,15 @@ MACOSX_DEPLOYMENT_TARGET: - '11.0' aws_crt_cpp: -- 0.19.8 +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: - '1' -c_ares: -- '1' c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -21,19 +19,21 @@ cuda_compiler_version: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' gflags: - '2.2' glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 macos_machine: @@ -41,12 +41,12 @@ macos_machine: numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -57,7 +57,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -67,6 +67,8 @@ thrift_cpp: zip_keys: - - c_compiler_version - cxx_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml new file mode 100644 index 0000000000000..e63767cbe9771 --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.1.yaml @@ -0,0 +1,27 @@ +c_compiler: +- gcc +c_compiler_version: +- '12' +cdt_name: +- cos6 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +docker_image: +- quay.io/condaforge/linux-anvil-cos7-x86_64 +pin_run_as_build: + r-base: + 
min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- linux-64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml index 38753baa7ed09..6e661e1357d22 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml @@ -1,7 +1,7 @@ c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_name: - cos6 channel_sources: @@ -11,7 +11,7 @@ channel_targets: cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml new file mode 100644 index 0000000000000..2b80b020fdc0b --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.1.yaml @@ -0,0 +1,31 @@ +BUILD: +- aarch64-conda_cos7-linux-gnu +c_compiler: +- gcc +c_compiler_version: +- '12' +cdt_arch: +- aarch64 +cdt_name: +- cos7 +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +docker_image: +- quay.io/condaforge/linux-anvil-cos7-x86_64 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- linux-aarch64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml index 2913bbb4f141f..9dcd0c34c851c 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml @@ -3,7 +3,7 @@ BUILD: c_compiler: - gcc c_compiler_version: -- '11' +- '12' cdt_arch: - aarch64 cdt_name: @@ -15,7 +15,7 @@ channel_targets: cxx_compiler: - gxx cxx_compiler_version: -- '11' +- '12' docker_image: - quay.io/condaforge/linux-anvil-cos7-x86_64 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml new file mode 100644 index 0000000000000..6be6c2f5462c5 --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.1.yaml @@ -0,0 +1,27 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +c_compiler: +- clang +c_compiler_version: +- '15' +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- clangxx +cxx_compiler_version: +- '15' +macos_machine: +- x86_64-apple-darwin13.4.0 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- osx-64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml index 25437ee4adcfe..2116eaf7b8b21 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml @@ -3,7 +3,7 @@ MACOSX_DEPLOYMENT_TARGET: c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -11,7 +11,7 @@ channel_targets: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' macos_machine: - x86_64-apple-darwin13.4.0 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml 
b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml new file mode 100644 index 0000000000000..0ce856fcccf5c --- /dev/null +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.1.yaml @@ -0,0 +1,27 @@ +MACOSX_DEPLOYMENT_TARGET: +- '11.0' +c_compiler: +- clang +c_compiler_version: +- '15' +channel_sources: +- conda-forge +channel_targets: +- conda-forge main +cxx_compiler: +- clangxx +cxx_compiler_version: +- '15' +macos_machine: +- arm64-apple-darwin20.0.0 +pin_run_as_build: + r-base: + min_pin: x.x + max_pin: x.x +r_base: +- '4.1' +target_platform: +- osx-arm64 +zip_keys: +- - c_compiler_version + - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml index 1557b23ff96af..af8a07c42208e 100644 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml @@ -3,7 +3,7 @@ MACOSX_DEPLOYMENT_TARGET: c_compiler: - clang c_compiler_version: -- '14' +- '15' channel_sources: - conda-forge channel_targets: @@ -11,7 +11,7 @@ channel_targets: cxx_compiler: - clangxx cxx_compiler_version: -- '14' +- '15' macos_machine: - arm64-apple-darwin20.0.0 pin_run_as_build: diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version10.2.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml similarity index 78% rename from dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version10.2.yaml rename to dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml index 6ea00e3bd0d3f..f75d92e276d9e 100644 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version10.2.yaml +++ b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_version11.2.yaml @@ -1,3 +1,5 @@ +aws_crt_cpp: +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: @@ -13,9 +15,9 @@ channel_targets: cuda_compiler: - nvcc cuda_compiler_version: -- '10.2' +- '11.2' cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - vs2019 gflags: @@ -23,28 +25,30 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libcrc32c: - '1.1' libcurl: -- '7' +- '8' libgrpc: -- '1.52' +- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -55,7 +59,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -63,6 +67,10 @@ target_platform: thrift_cpp: - 0.18.1 zip_keys: +- - cuda_compiler + - cuda_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml index 183356662c648..6d8fb15b15a2a 100644 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml +++ b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNone.yaml @@ -1,3 +1,5 @@ +aws_crt_cpp: +- 0.20.3 aws_sdk_cpp: - 1.10.57 bzip2: @@ -11,11 +13,11 @@ channel_sources: channel_targets: - conda-forge main cuda_compiler: -- nvcc +- None cuda_compiler_version: - None cuda_compiler_version_min: -- '10.2' +- '11.2' cxx_compiler: - vs2019 gflags: @@ -23,28 +25,30 @@ gflags: glog: - '0.6' google_cloud_cpp: -- 2.8.0 +- '2.12' libabseil: - '20230125' libcrc32c: - '1.1' libcurl: -- '7' +- '8' libgrpc: -- '1.52' 
+- '1.54' +- '1.56' libprotobuf: - '3.21' +- 4.23.3 lz4_c: - 1.9.3 numpy: - '1.21' - '1.23' -- '1.20' -- '1.20' +- '1.21' +- '1.21' openssl: - '3' orc: -- 1.8.3 +- 1.9.0 pin_run_as_build: python: min_pin: x.x @@ -55,7 +59,7 @@ python: - 3.8.* *_cpython - 3.9.* *_cpython re2: -- 2023.02.02 +- 2023.03.02 snappy: - '1' target_platform: @@ -63,6 +67,10 @@ target_platform: thrift_cpp: - 0.18.1 zip_keys: +- - cuda_compiler + - cuda_compiler_version +- - libgrpc + - libprotobuf - - python - numpy zlib: diff --git a/dev/tasks/conda-recipes/arrow-cpp/activate.sh b/dev/tasks/conda-recipes/arrow-cpp/activate.sh index 90210fac0a034..8757612781bbe 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/activate.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/activate.sh @@ -7,24 +7,70 @@ # doesn't come with a deactivate script, because the symlink # is benign and doesn't need to be deleted. -# where the GDB wrappers get installed -GDB_PREFIX=$CONDA_PREFIX/share/gdb/auto-load +_la_log() { + if [ "${CF_LIBARROW_ACTIVATE_LOGGING:-}" = "1" ]; then + # The following loop is necessary to handle multi-line strings + # like for the output of `ls -al`. + printf '%s\n' "$*" | while IFS= read -r line + do + echo "$CONDA_PREFIX/etc/conda/activate.d/libarrow_activate.sh DEBUG: $line" + done + fi +} + +_la_log "Beginning libarrow activation." -# If the directory is not writable, nothing can be done -if [ ! -w $GDB_PREFIX ]; then - return -fi +# where the GDB wrappers get installed +_la_gdb_prefix="$CONDA_PREFIX/share/gdb/auto-load" -# this needs to be in sync with the respective patch -PLACEHOLDER=replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX +# this needs to be in sync with ARROW_GDB_INSTALL_DIR in build.sh +_la_placeholder="replace_this_section_with_absolute_slashed_path_to_CONDA_PREFIX" # the paths here are intentionally stacked, see #935, resp. # https://github.com/apache/arrow/blob/master/docs/source/cpp/gdb.rst#manual-loading -WRAPPER_DIR=$GDB_PREFIX/$CONDA_PREFIX/lib +_la_symlink_dir="$_la_gdb_prefix/$CONDA_PREFIX/lib" +_la_orig_install_dir="$_la_gdb_prefix/$_la_placeholder/lib" -mkdir -p $WRAPPER_DIR -# there's only one lib in that folder, but the libname changes +_la_log " _la_gdb_prefix: $_la_gdb_prefix" +_la_log " _la_placeholder: $_la_placeholder" +_la_log " _la_symlink_dir: $_la_symlink_dir" +_la_log " _la_orig_install_dir: $_la_orig_install_dir" +_la_log " content of that folder:" +_la_log "$(ls -al "$_la_orig_install_dir" | sed 's/^/ /')" + +# there's only one lib in the _la_orig_install_dir folder, but the libname changes # based on the version so use a loop instead of hardcoding it. -for f in $GDB_PREFIX/$PLACEHOLDER/lib/*.py; do - # overwrite, because we don't have deactivation (i.e. symlink remains) - ln -sf $f $WRAPPER_DIR/$(basename $f) +for _la_target in "$_la_orig_install_dir/"*.py; do + if [ ! -e "$_la_target" ]; then + # If the file doesn't exist, skip this iteration of the loop. + # (This happens when no files are found, in which case the + # loop runs with target equal to the pattern itself.) + _la_log 'Folder $_la_orig_install_dir seems to not contain .py files, skipping' + continue + fi + _la_symlink="$_la_symlink_dir/$(basename "$_la_target")" + _la_log " _la_target: $_la_target" + _la_log " _la_symlink: $_la_symlink" + if [ -L "$_la_symlink" ] && [ "$(readlink "$_la_symlink")" = "$_la_target" ]; then + _la_log 'symlink $_la_symlink already exists and points to $_la_target, skipping.' 
+ continue + fi + _la_log 'Creating symlink $_la_symlink pointing to $_la_target' + mkdir -p "$_la_symlink_dir" || true + # this check also creates the symlink; if it fails, we enter the if-branch. + if ! ln -sf "$_la_target" "$_la_symlink"; then + echo -n "${BASH_SOURCE[0]} ERROR: Failed to create symlink from " + echo -n "'$_la_target' to '$_la_symlink'" + echo + continue + fi done + +_la_log "Libarrow activation complete." + +unset _la_gdb_prefix +unset _la_log +unset _la_orig_install_dir +unset _la_placeholder +unset _la_symlink +unset _la_symlink_dir +unset _la_target diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.bat similarity index 89% rename from dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat rename to dev/tasks/conda-recipes/arrow-cpp/build-arrow.bat index 60c81be741128..1268771643d4f 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat +++ b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.bat @@ -1,16 +1,12 @@ @echo on -mkdir "%SRC_DIR%"\cpp\build -pushd "%SRC_DIR%"\cpp\build +mkdir cpp\build +pushd cpp\build :: Enable CUDA support if "%cuda_compiler_version%"=="None" ( set "EXTRA_CMAKE_ARGS=-DARROW_CUDA=OFF" ) else ( - REM this should move to nvcc-feedstock - set "CUDA_PATH=%CUDA_PATH:\=/%" - set "CUDA_HOME=%CUDA_HOME:\=/%" - set "EXTRA_CMAKE_ARGS=-DARROW_CUDA=ON" ) @@ -18,8 +14,9 @@ if "%cuda_compiler_version%"=="None" ( set "READ_RECIPE_META_YAML_WHY_NOT=OFF" :: for available switches see -:: https://github.com/apache/arrow/blame/apache-arrow-11.0.0/cpp/cmake_modules/DefineOptions.cmake +:: https://github.com/apache/arrow/blame/apache-arrow-12.0.0/cpp/cmake_modules/DefineOptions.cmake cmake -G "Ninja" ^ + -DARROW_ACERO=ON ^ -DARROW_BOOST_USE_SHARED:BOOL=ON ^ -DARROW_BUILD_STATIC:BOOL=OFF ^ -DARROW_BUILD_TESTS:BOOL=OFF ^ @@ -69,3 +66,6 @@ cmake --build . --target install --config Release if %ERRORLEVEL% neq 0 exit 1 popd + +:: clean up between builds (and to save space) +rmdir /s /q cpp\build diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh index fb8cbade86568..dc588f9473870 100755 --- a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh @@ -30,27 +30,21 @@ fi # Enable CUDA support if [[ ! -z "${cuda_compiler_version+x}" && "${cuda_compiler_version}" != "None" ]] then - if [[ -z "${CUDA_HOME+x}" ]] - then - echo "cuda_compiler_version=${cuda_compiler_version} CUDA_HOME=$CUDA_HOME" - CUDA_GDB_EXECUTABLE=$(which cuda-gdb || exit 0) - if [[ -n "$CUDA_GDB_EXECUTABLE" ]] - then - CUDA_HOME=$(dirname $(dirname $CUDA_GDB_EXECUTABLE)) - else - echo "Cannot determine CUDA_HOME: cuda-gdb not in PATH" - return 1 - fi - fi - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CUDA_HOME}/lib64/stubs" + EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CONDA_BUILD_SYSROOT}/lib" else EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=OFF" fi -if [[ "${target_platform}" == "osx-arm64" ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCLANG_EXECUTABLE=${BUILD_PREFIX}/bin/clang -DLLVM_LINK_EXECUTABLE=${BUILD_PREFIX}/bin/llvm-link" +if [[ "${build_platform}" != "${target_platform}" ]]; then + # point to a usable protoc/grpc_cpp_plugin if we're cross-compiling + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DProtobuf_PROTOC_EXECUTABLE=$BUILD_PREFIX/bin/protoc" + if [[ ! 
-f ${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang ]]; then + ln -sf ${BUILD_PREFIX}/bin/clang ${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang + fi + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCLANG_EXECUTABLE=${BUILD_PREFIX}/bin/${CONDA_TOOLCHAIN_HOST}-clang" + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DLLVM_LINK_EXECUTABLE=${BUILD_PREFIX}/bin/llvm-link" sed -ie "s;protoc-gen-grpc.*$;protoc-gen-grpc=${BUILD_PREFIX}/bin/grpc_cpp_plugin\";g" ../src/arrow/flight/CMakeLists.txt - sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=14";g' ../cmake_modules/ThirdpartyToolchain.cmake + sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=16";g' ../cmake_modules/ThirdpartyToolchain.cmake fi # disable -fno-plt, which causes problems with GCC on PPC @@ -59,23 +53,19 @@ if [[ "$target_platform" == "linux-ppc64le" ]]; then CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fno-plt //g')" fi -# Limit number of threads used to avoid hardware oversubscription if [[ "${target_platform}" == "linux-aarch64" ]] || [[ "${target_platform}" == "linux-ppc64le" ]]; then - export CMAKE_BUILD_PARALLEL_LEVEL=3 -fi - -# point to a usable protoc if we're running on a different architecture than the target -if [[ "${build_platform}" != "${target_platform}" ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DProtobuf_PROTOC_EXECUTABLE=$BUILD_PREFIX/bin/protoc" + # Limit number of threads used to avoid hardware oversubscription + export CMAKE_BUILD_PARALLEL_LEVEL=3 fi # reusable variable for dependencies we cannot yet unvendor export READ_RECIPE_META_YAML_WHY_NOT=OFF # for available switches see -# https://github.com/apache/arrow/blame/apache-arrow-11.0.0/cpp/cmake_modules/DefineOptions.cmake -# placeholder in ARROW_GDB_INSTALL_DIR must match what's used for replacement in activate.sh +# https://github.com/apache/arrow/blame/apache-arrow-12.0.0/cpp/cmake_modules/DefineOptions.cmake +# placeholder in ARROW_GDB_INSTALL_DIR must match _la_placeholder in activate.sh cmake -GNinja \ + -DARROW_ACERO=ON \ -DARROW_BOOST_USE_SHARED=ON \ -DARROW_BUILD_BENCHMARKS=OFF \ -DARROW_BUILD_STATIC=OFF \ @@ -129,3 +119,6 @@ cmake -GNinja \ cmake --build . --target install --config Release popd + +# clean up between builds (and to save space) +rm -rf cpp/build diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.bat similarity index 54% rename from dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat rename to dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.bat index 084faf74e4a10..e3eaa32bcf848 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.bat @@ -2,22 +2,10 @@ pushd "%SRC_DIR%"\python -@rem the symlinks for cmake modules don't work here -@rem NOTE: In contrast to conda-forge, they work here as we clone from git. 
-@rem del cmake_modules\BuildUtils.cmake -@rem del cmake_modules\SetupCxxFlags.cmake -@rem del cmake_modules\CompilerInfo.cmake -@rem del cmake_modules\FindNumPy.cmake -@rem del cmake_modules\FindPythonLibsNew.cmake -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\BuildUtils.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\SetupCxxFlags.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\CompilerInfo.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\FindNumPy.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\FindPythonLibsNew.cmake" cmake_modules\ - SET ARROW_HOME=%LIBRARY_PREFIX% SET SETUPTOOLS_SCM_PRETEND_VERSION=%PKG_VERSION% SET PYARROW_BUILD_TYPE=release +SET PYARROW_WITH_ACERO=1 SET PYARROW_WITH_DATASET=1 SET PYARROW_WITH_FLIGHT=1 SET PYARROW_WITH_GANDIVA=1 diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh index 14c67ede6324e..9c12321a1c115 100755 --- a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh @@ -6,6 +6,7 @@ export ARROW_HOME=$PREFIX export PARQUET_HOME=$PREFIX export SETUPTOOLS_SCM_PRETEND_VERSION=$PKG_VERSION export PYARROW_BUILD_TYPE=release +export PYARROW_WITH_ACERO=1 export PYARROW_WITH_DATASET=1 export PYARROW_WITH_FLIGHT=1 export PYARROW_WITH_GANDIVA=1 @@ -37,9 +38,9 @@ if [[ "${target_platform}" == osx-* ]]; then CXXFLAGS="${CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" fi -# Limit number of threads used to avoid hardware oversubscription if [[ "${target_platform}" == "linux-aarch64" ]] || [[ "${target_platform}" == "linux-ppc64le" ]]; then - export CMAKE_BUILD_PARALLEL_LEVEL=4 + # Limit number of threads used to avoid hardware oversubscription + export CMAKE_BUILD_PARALLEL_LEVEL=4 fi cd python diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 2f79bbe958c07..e61034c3075b3 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -4,7 +4,7 @@ {% set build_ext_version = ARROW_VERSION %} {% set build_ext = "cuda" if cuda_enabled else "cpu" %} {% set proc_build_number = "0" %} -{% set llvm_version = "14" %} +{% set llvm_version = "15" %} # see https://github.com/apache/arrow/blob/apache-arrow-10.0.1/cpp/CMakeLists.txt#L88-L90 {% set so_version = (version.split(".")[0] | int * 100 + version.split(".")[1] | int) ~ "." ~ version.split(".")[2] ~ ".0" %} @@ -21,11 +21,9 @@ build: # for cuda support, building with one version is enough to be compatible with # all later versions, since arrow is only using libcuda, and not libcudart. 
skip: true # [cuda_compiler_version not in ("None", cuda_compiler_version_min)] - # temporary: skip CUDA on aarch/ppc until cross-compilation works, see - # https://github.com/conda-forge/conda-forge-ci-setup-feedstock/pull/210 - skip: true # [(aarch64 or ppc64le) and (cuda_compiler_version != "None")] + # arrow promises API- & ABI-compatibility along SemVer, see #1096 run_exports: - - {{ pin_subpackage("libarrow", max_pin="x.x.x") }} + - {{ pin_subpackage("libarrow", max_pin="x") }} outputs: - name: apache-arrow-proc @@ -57,30 +55,38 @@ outputs: - exit 0 - name: libarrow - script: build-arrow.sh # [not win] - script: bld-arrow.bat # [win] + script: build-arrow.sh # [unix] + script: build-arrow.bat # [win] version: {{ version }} build: string: h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} run_exports: - {{ pin_subpackage("libarrow", max_pin="x.x.x") }} + ignore_run_exports_from: + - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] + # arrow only uses headers, apparently + - gflags + # shared lib linked on unix, not on win + - glog # [win] ignore_run_exports: - - cudatoolkit + # we don't need all of brotli's run-exports + - libbrotlicommon track_features: {{ "[arrow-cuda]" if cuda_enabled else "" }} missing_dso_whitelist: - - "*/libcuda.so.*" # [linux] - - "*/nvcuda.dll" # [win] + - '*/libcuda.so.*' # [linux] + - '*/nvcuda.dll' # [win] requirements: build: - {{ compiler("c") }} - {{ compiler("cxx") }} - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - - clangdev {{ llvm_version }} # [osx and arm64] - - llvmdev {{ llvm_version }} # [osx and arm64] - - gnuconfig # [osx and arm64] # needs to run protoc & grpc_cpp_plugin - libgrpc # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] + # needed for gandiva + - clangdev {{ llvm_version }} # [build_platform != target_platform] + - llvmdev {{ llvm_version }} # [build_platform != target_platform] + - gnuconfig # [build_platform != target_platform] - cmake - ninja # necessary for vendored jemalloc @@ -91,12 +97,11 @@ outputs: # https://github.com/apache/arrow/blob/apache-arrow-11.0.0/cpp/cmake_modules/ThirdpartyToolchain.cmake#L46-L75 - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - - aws-crt-cpp # [unix] + - aws-crt-cpp - aws-sdk-cpp - boost-cpp >=1.70 - brotli - bzip2 - - c-ares # not yet: https://github.com/conda-forge/cpp-opentelemetry-sdk-feedstock/issues/38 # - cpp-opentelemetry-sdk # - proto-opentelemetry-proto =={{ cpp_opentelemetry_sdk }} @@ -106,11 +111,6 @@ outputs: # arrow uses a customized jemalloc, see #944 # - jemalloc - libabseil - # since libgoogle-cloud is static on windows, see - # https://github.com/conda-forge/google-cloud-cpp-feedstock/pull/108, - # its dependencies leak into the build here - - libcrc32c # [win] - - libcurl # [win] - libgrpc - libprotobuf - libutf8proc @@ -127,17 +127,26 @@ outputs: - xsimd - zlib - zstd + - __cuda >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] + # since libgoogle-cloud is static on windows, see + # https://github.com/conda-forge/google-cloud-cpp-feedstock/pull/108, + # its host deps (which aren't yet covered above) leak into the build here + - libcrc32c # [win] + - libcurl # [win] + # same for libgrpc (before 1.55.0, which is coupled with libprotobuf 4.23.x) + - c-ares # [win and libprotobuf == "3.21"] run_constrained: - apache-arrow-proc =*={{ build_ext }} - - cudatoolkit >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] # make sure we don't co-install with old version of old 
package name - arrow-cpp ={{ version }} + # old parquet lib output, now part of this feedstock + - parquet-cpp <0.0a0 test: commands: {% set headers = [ - "arrow/api.h", "arrow/flight/types.h", "arrow/flight/sql/api.h", - "gandiva/engine.h", "parquet/api/reader.h" + "arrow/api.h", "arrow/acero/api.h", "arrow/flight/types.h", + "arrow/flight/sql/api.h", "gandiva/engine.h", "parquet/api/reader.h" ] %} {% for each_header in headers %} # headers @@ -146,8 +155,8 @@ outputs: {% endfor %} {% set libs = (cuda_compiler_version != "None") * ["arrow_cuda"] + [ - "arrow", "arrow_dataset", "arrow_flight", "arrow_flight_sql", - "arrow_substrait", "gandiva", "parquet" + "arrow", "arrow_acero", "arrow_dataset", "arrow_flight", + "arrow_flight_sql", "arrow_substrait", "gandiva", "parquet" ] %} {% for each_lib in libs %} # shared @@ -189,6 +198,8 @@ outputs: requirements: host: - {{ pin_subpackage('libarrow', exact=True) }} + # avoid wrappers for different builds colliding due to identical hashes + - libprotobuf run: - {{ pin_subpackage('libarrow', exact=True) }} test: @@ -196,21 +207,21 @@ outputs: - exit 0 - name: pyarrow - script: build-pyarrow.sh # [not win] - script: bld-pyarrow.bat # [win] + script: build-pyarrow.sh # [unix] + script: build-pyarrow.bat # [win] version: {{ version }} build: string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - ignore_run_exports: - - cudatoolkit + ignore_run_exports_from: + - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] track_features: {{ "[arrow-cuda]" if cuda_enabled else "" }} rpaths: - lib/ - {{ SP_DIR }}/pyarrow missing_dso_whitelist: # not actually missing, but installed into SP_DIR, see tests - - "*/arrow_python.dll" # [win] - - "*/arrow_python_flight.dll" # [win] + - '*/arrow_python.dll' # [win] + - '*/arrow_python_flight.dll' # [win] requirements: build: - {{ compiler("c") }} @@ -219,29 +230,28 @@ outputs: - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] + - cython <3 # [build_platform != target_platform] - numpy # [build_platform != target_platform] - cmake - ninja host: - - {{ pin_subpackage('libarrow', exact=True) }} + # we're building for two protobuf versions, cannot pin exactly + # - {{ pin_subpackage('libarrow', exact=True) }} + - libarrow ={{ version }}=*_{{ PKG_BUILDNUM }}_{{ build_ext }} - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - - cython - - gflags # [unix] + - cython <3 - numpy - python - setuptools - setuptools_scm run: - - {{ pin_subpackage('libarrow', exact=True) }} + # - {{ pin_subpackage('libarrow', exact=True) }} + - libarrow ={{ version }}=*_{{ PKG_BUILDNUM }}_{{ build_ext }} - {{ pin_compatible('numpy') }} - # empty parquet-cpp metapackage, force old versions to be uninstalled - - parquet-cpp 1.5.1.* - python run_constrained: - apache-arrow-proc =*={{ build_ext }} - - cudatoolkit >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] test: files: @@ -288,13 +298,13 @@ outputs: summary: Python libraries for Apache Arrow - name: pyarrow-tests - script: build-pyarrow.sh # [not win] - script: bld-pyarrow.bat # [win] + script: build-pyarrow.sh # [unix] + script: build-pyarrow.bat # [win] version: {{ version }} build: string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - ignore_run_exports: - - cudatoolkit + ignore_run_exports_from: + - {{ compiler("cuda") }} # 
[cuda_compiler_version != "None"] track_features: {{ "[arrow-cuda]" if cuda_enabled else "" }} requirements: build: @@ -304,7 +314,7 @@ outputs: - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] + - cython <3 # [build_platform != target_platform] - numpy # [build_platform != target_platform] - cmake - ninja @@ -313,7 +323,7 @@ outputs: - {{ pin_subpackage('pyarrow', exact=True) }} - clangdev {{ llvm_version }} - llvmdev {{ llvm_version }} - - cython + - cython <3 - numpy - python - setuptools @@ -323,27 +333,36 @@ outputs: - python run_constrained: - apache-arrow-proc =*={{ build_ext }} - - cudatoolkit >={{ cuda_compiler_version_min }} # [cuda_compiler_version != "None"] + # crossbow CI: reduce to one python version, except on (unemulated) linux, where it's fast enough + {% if linux64 or py == 311 %} + # {% if not (aarch64 or ppc64le) or py in (310, 311) %} + # only run the full test suite for one python version when in emulation (each run takes ~45min); + # there's essentially zero divergence in behaviour across python versions anyway, and otherwise + # CUDA builds for aarch/ppc consistently run out of disk space on azure for some reason test: requires: + # vary protobuf version in test suite (historically, test failures only have a very + # weak dependency on python version, so we don't lose coverage by doing half & half) + - libprotobuf <4 # [py % 2 == 0] # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - - pytest + # temporary pin due to missing fixture + - pytest <7.4.0 - pytest-lazy-fixture - backports.zoneinfo # [py<39] - cffi - cloudpickle - - cython + - cython <3 - fastparquet - fsspec - hypothesis - pandas - scipy - # not all OSes/arches available in conda-forge - - pytorch * # [unix and not ppc64le] - # not yet rebuilt for libabseil 20230125 - # - tensorflow # [unix and x86_64 and py<311] + # these are generally (far) behind on migrating abseil/grpc/protobuf, + # and using them as test dependencies blocks the migrator unnecessarily + # - pytorch + # - tensorflow # we're not building java bindings # - jpype1 # doesn't get picked up correctly @@ -364,6 +383,8 @@ outputs: # skip tests that raise SIGINT and crash the test suite {% set tests_to_skip = tests_to_skip + " or (test_csv and test_cancellation)" %} # [linux] {% set tests_to_skip = tests_to_skip + " or (test_flight and test_interrupt)" %} # [linux] + # tests that may crash the agent due to out-of-bound memory writes or other risky stuff + {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool" %} # [aarch64 or ppc64le] # cannot pass -D_LIBCPP_DISABLE_AVAILABILITY to test suite for our older macos sdk {% set tests_to_skip = tests_to_skip + " or test_cpp_extension_in_python" %} # [osx] # skip tests that make invalid(-for-conda) assumptions about the compilers setup @@ -373,6 +394,7 @@ outputs: {% set tests_to_skip = tests_to_skip + " or test_debug_memory_pool_disabled" %} # [aarch64 or ppc64le] {% set tests_to_skip = tests_to_skip + " or test_env_var_io_thread_count" %} # [aarch64 or ppc64le] # vvvvvvv TESTS THAT SHOULDN'T HAVE TO BE SKIPPED vvvvvvv + {% set tests_to_skip = tests_to_skip + " or test_extension_to_pandas_storage_type" %} # segfaults on OSX: to investigate ASAP {% set tests_to_skip = tests_to_skip + " or test_flight" %} # [osx] # gandiva tests are segfaulting on ppc @@ -385,10 +407,12 @@ 
outputs: {% set tests_to_skip = tests_to_skip + " or (test_memory and test_env_var)" %} # [unix] # test is broken; header is in $PREFIX, not $SP_DIR {% set tests_to_skip = tests_to_skip + " or (test_misc and test_get_include)" %} # [unix] + # flaky tests that fail occasionally + {% set tests_to_skip = tests_to_skip + " or test_total_bytes_allocated " %} # [linux] + {% set tests_to_skip = tests_to_skip + " or test_feather_format " %} # [linux] # ^^^^^^^ TESTS THAT SHOULDN'T HAVE TO BE SKIPPED ^^^^^^^ - - # crossbow CI: reduce to one python version, except on (unemulated) linux, where it's fast enough - - pytest -v -rfEs -k "not ({{ tests_to_skip }})" # [linux64 or (py==310 and build_platform==target_platform)] + - pytest -rfEs -k "not ({{ tests_to_skip }})" + {% endif %} about: home: http://github.com/apache/arrow diff --git a/dev/tasks/conda-recipes/azure.linux.yml b/dev/tasks/conda-recipes/azure.linux.yml index b9a54647cc525..279ffb48ccd60 100755 --- a/dev/tasks/conda-recipes/azure.linux.yml +++ b/dev/tasks/conda-recipes/azure.linux.yml @@ -13,6 +13,29 @@ jobs: UPLOAD_PACKAGES: False steps: + - script: | + sudo mkdir -p /opt/empty_dir || true + for d in \ + /opt/ghc \ + /opt/hostedtoolcache \ + /usr/lib/jvm \ + /usr/local/.ghcup \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/share/dotnet \ + /usr/share/swift \ + ; do + sudo rsync --stats -a --delete /opt/empty_dir/ $d || true + done + sudo apt-get purge -y -f firefox \ + google-chrome-stable \ + microsoft-edge-stable + sudo apt-get autoremove -y >& /dev/null + sudo apt-get autoclean -y >& /dev/null + sudo docker image prune --all --force + df -h + displayName: Manage disk space + # configure qemu binfmt-misc running. This allows us to run docker containers # embedded qemu-static - script: | diff --git a/dev/tasks/conda-recipes/r-arrow/configure.win b/dev/tasks/conda-recipes/r-arrow/configure.win index fb16a810b8f22..0fc96576bde74 100755 --- a/dev/tasks/conda-recipes/r-arrow/configure.win +++ b/dev/tasks/conda-recipes/r-arrow/configure.win @@ -2,7 +2,7 @@ set -euxo pipefail -echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -DARROW_R_WITH_JSON" > src/Makevars.win +echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ACERO -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -DARROW_R_WITH_JSON" > src/Makevars.win echo "PKG_CXXFLAGS=\$(CXX_VISIBILITY)" >> src/Makevars.win echo 'CXX_STD=CXX17' >> src/Makevars.win -echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -lparquet -larrow" >> src/Makevars.win +echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -larrow_acero -lparquet -larrow" >> src/Makevars.win diff --git a/dev/tasks/conda-recipes/r-arrow/meta.yaml b/dev/tasks/conda-recipes/r-arrow/meta.yaml index 28ee8eb92c921..e8b834254f41c 100644 --- a/dev/tasks/conda-recipes/r-arrow/meta.yaml +++ b/dev/tasks/conda-recipes/r-arrow/meta.yaml @@ -10,8 +10,6 @@ source: path: ../../../../ build: - # 4.1 not usable anymore unless https://github.com/conda-forge/r-base-feedstock/pull/236 gets merged - skip: true # [unix and (r_base == "4.1")] merge_build_host: true # [win] number: 0 rpaths: diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index cca770438574a..05dafade97434 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -246,6 +246,16 @@ tasks: # generated and to be synced regularly from the feedstock. 
We have no way # yet to generate them inside the arrow repository automatically. + conda-linux-x64-cpu-r41: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_64_cuda_compiler_versionNone + r_config: linux_64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-linux-x64-cpu-r42: ci: azure template: conda-recipes/azure.linux.yml @@ -272,7 +282,7 @@ tasks: ci: azure template: conda-recipes/azure.linux.yml params: - config: linux_64_cuda_compiler_version10.2 + config: linux_64_cuda_compiler_version11.2 artifacts: - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda @@ -282,6 +292,16 @@ tasks: ########################### Conda Linux (aarch64) ########################### + conda-linux-aarch64-cpu-r41: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_aarch64_cuda_compiler_versionNone + r_config: linux_aarch64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-linux-aarch64-cpu-r42: ci: azure template: conda-recipes/azure.linux.yml @@ -304,6 +324,18 @@ tasks: - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cpu.conda - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cpu.conda + conda-linux-aarch64-cuda-py3: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_aarch64_cuda_compiler_version11.2 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cuda.conda + ########################### Conda Linux (ppc64le) ########################### conda-linux-ppc64le-cpu-py3: @@ -318,8 +350,30 @@ tasks: - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cpu.conda - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cpu.conda + conda-linux-ppc64le-cuda-py3: + ci: azure + template: conda-recipes/azure.linux.yml + params: + config: linux_ppc64le_cuda_compiler_version11.2 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py310(h[a-z0-9]+)_0_cuda.conda + - pyarrow-{no_rc_version}-py311(h[a-z0-9]+)_0_cuda.conda + ############################## Conda OSX (x64) ############################## + conda-osx-x64-cpu-r41: + ci: azure + template: conda-recipes/azure.osx.yml + params: + config: osx_64_ + r_config: osx_64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-osx-x64-cpu-r42: ci: azure template: conda-recipes/azure.osx.yml @@ -344,6 +398,16 @@ tasks: ############################# Conda OSX (arm64) ############################# + conda-osx-arm64-cpu-r41: + ci: azure + template: conda-recipes/azure.osx.yml + params: + config: osx_arm64_ + r_config: osx_arm64_r_base4.1 + artifacts: + - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda + - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda + conda-osx-arm64-cpu-r42: ci: azure template: conda-recipes/azure.osx.yml @@ -396,7 +460,7 @@ tasks: ci: azure template: conda-recipes/azure.win.yml params: - config: win_64_cuda_compiler_versionNone + config: win_64_cuda_compiler_version11.2 artifacts: - 
libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cuda.conda
        - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.conda

From 9fe23dac68504ae99b0bf182a412c776fce227cb Mon Sep 17 00:00:00 2001
From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com>
Date: Wed, 26 Jul 2023 10:50:54 -0400
Subject: [PATCH 052/749] GH-36853: [MATLAB] Add utility to create proxies from
 existing `arrow::DataType` objects (#36873)

### Rationale for this change

There will be many places in the MATLAB interface code base in which we will have to wrap an `arrow::DataType` object within a subclass of `arrow::matlab::type::proxy::Type`. To avoid code duplication, we should add a utility function called `wrap` that accepts a pointer to an `arrow::DataType` object and returns a pointer to an `arrow::matlab::type::proxy::Type` object.

### What changes are included in this PR?

1. Added a new function with the following signature:

```cpp
arrow::Result<std::shared_ptr<type::proxy::Type>> wrap(const std::shared_ptr<arrow::DataType>& datatype);
```

2. Updated the `type` methods of `arrow::matlab::type::proxy::Field` and `arrow::matlab::array::proxy::Array` to use `wrap`.

### Are these changes tested?

No new tests needed.

### Are there any user-facing changes?

No

* Closes: #36853

Lead-authored-by: Sarah Gilmore
Co-authored-by: sgilmore10 <74676073+sgilmore10@users.noreply.github.com>
Co-authored-by: Sutou Kouhei
Signed-off-by: Kevin Gurney
---
 .../src/cpp/arrow/matlab/array/proxy/array.cc | 11 ++--
 .../src/cpp/arrow/matlab/array/proxy/array.h  |  2 -
 .../arrow/matlab/array/proxy/boolean_array.cc |  7 ---
 .../arrow/matlab/array/proxy/boolean_array.h  |  3 -
 .../arrow/matlab/array/proxy/numeric_array.h  |  6 --
 .../arrow/matlab/array/proxy/string_array.cc  |  8 ---
 .../arrow/matlab/array/proxy/string_array.h   |  3 -
 matlab/src/cpp/arrow/matlab/error/error.h     |  1 +
 .../src/cpp/arrow/matlab/type/proxy/field.cc  | 41 +------------
 .../src/cpp/arrow/matlab/type/proxy/wrap.cc   | 59 +++++++++++++++++++
 matlab/src/cpp/arrow/matlab/type/proxy/wrap.h | 28 +++++++++
 .../cmake/BuildMatlabArrowInterface.cmake     |  4 +-
 12 files changed, 101 insertions(+), 72 deletions(-)
 create mode 100644 matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc
 create mode 100644 matlab/src/cpp/arrow/matlab/type/proxy/wrap.h

diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
index c2d0330b5f78e..8520cf1f21fdd 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
@@ -20,8 +20,8 @@
 #include "arrow/matlab/array/proxy/array.h"
 #include "arrow/matlab/bit/unpack.h"
 #include "arrow/matlab/error/error.h"
+#include "arrow/matlab/type/proxy/wrap.h"
 #include "arrow/type_traits.h"
-#include "arrow/visit_array_inline.h"

 #include "libmexclass/proxy/ProxyManager.h"

@@ -80,12 +80,15 @@ namespace arrow::matlab::array::proxy {

         mda::ArrayFactory factory;

-        auto type_proxy = typeProxy();
+        MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto type_proxy,
+                                            type::proxy::wrap(array->type()),
+                                            context,
+                                            error::ARRAY_FAILED_TO_CREATE_TYPE_PROXY);
+
         auto type_id = type_proxy->unwrap()->id();

         auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(type_proxy);
         context.outputs[0] = factory.createScalar(proxy_id);
         context.outputs[1] = factory.createScalar(static_cast(type_id));
-    }
-}
\ No newline at end of file
+}
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.h b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
index 55d48c26eff6f..90199767258e2 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/array.h
+++ 
b/matlab/src/cpp/arrow/matlab/array/proxy/array.h @@ -44,8 +44,6 @@ class Array : public libmexclass::proxy::Proxy { virtual void toMATLAB(libmexclass::proxy::method::Context& context) = 0; - virtual std::shared_ptr typeProxy() = 0; - std::shared_ptr array; }; diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc index 281a0f732d73a..5be0cfb5a3d13 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc +++ b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc @@ -54,11 +54,4 @@ namespace arrow::matlab::array::proxy { auto logical_array_mda = bit::unpack(packed_logical_data_buffer, array_length); context.outputs[0] = logical_array_mda; } - - std::shared_ptr BooleanArray::typeProxy() { - using BooleanTypeProxy = type::proxy::PrimitiveCType; - - auto type = std::static_pointer_cast(array->type()); - return std::make_shared(std::move(type)); - } } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h index 5e6e51f0bc8ff..775673c29eada 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h @@ -32,9 +32,6 @@ namespace arrow::matlab::array::proxy { protected: void toMATLAB(libmexclass::proxy::method::Context& context) override; - - std::shared_ptr typeProxy() override; - }; } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h index f358e05db6318..6893079c78b95 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h @@ -85,12 +85,6 @@ class NumericArray : public arrow::matlab::array::proxy::Array { ::matlab::data::TypedArray result = factory.createArray({num_elements, 1}, data_begin, data_end); context.outputs[0] = result; } - - std::shared_ptr typeProxy() override { - using TypeProxy = typename type::proxy::Traits::TypeProxy; - auto type = std::static_pointer_cast(array->type()); - return std::make_shared(std::move(type)); - } }; // Specialization of NumericArray::Make for arrow::TimestampType. 
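(Editor's note: the following sketch is not part of the patch. It illustrates, at the MATLAB level, the behavior that routing the `type` methods through the single `wrap()` switch supports. `arrow.field` and the `arrow.type.timestamp` constructor are assumed from the MATLAB sources and tests elsewhere in this patch series, using the pre-#36875 package names that exist at this point.)

```matlab
% Minimal sketch, assuming the constructor names in use at patch 052.
field = arrow.field("ts", arrow.type.timestamp(TimeUnit="nanosecond"));
type = field.Type;   % in C++, Field::type now delegates to type::proxy::wrap()
class(type)          % expected: 'arrow.type.TimestampType'
```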
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc index 16331f6195a22..c583e8851a3ac 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc +++ b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc @@ -81,12 +81,4 @@ namespace arrow::matlab::array::proxy { auto array_mda = factory.createArray({array_length, 1}, strings.begin(), strings.end()); context.outputs[0] = array_mda; } - - std::shared_ptr StringArray::typeProxy() { - using StringTypeProxy = type::proxy::StringType; - - auto type = std::static_pointer_cast(array->type()); - return std::make_shared(std::move(type)); - } - } diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h index abb2322edbd20..bdcfedd7cdda3 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/string_array.h @@ -33,9 +33,6 @@ namespace arrow::matlab::array::proxy { protected: void toMATLAB(libmexclass::proxy::method::Context& context) override; - - std::shared_ptr typeProxy() override; - }; } diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index b253e6c20ed27..3d134d169e7af 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -172,4 +172,5 @@ namespace arrow::matlab::error { static const char* STRING_BUILDER_FINISH_FAILED = "arrow:matlab:array:string:StringBuilderFinishFailed"; static const char* UKNOWN_TIME_UNIT_ERROR_ID = "arrow:matlab:UnknownTimeUnit"; static const char* FIELD_FAILED_TO_CREATE_TYPE_PROXY = "arrow:field:FailedToCreateTypeProxy"; + static const char* ARRAY_FAILED_TO_CREATE_TYPE_PROXY = "arrow:array:FailedToCreateTypeProxy"; } diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/field.cc b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc index 4a43d813f0567..0cf7c995fb275 100644 --- a/matlab/src/cpp/arrow/matlab/type/proxy/field.cc +++ b/matlab/src/cpp/arrow/matlab/type/proxy/field.cc @@ -23,6 +23,7 @@ #include "arrow/matlab/type/proxy/primitive_ctype.h" #include "arrow/matlab/type/proxy/timestamp_type.h" #include "arrow/matlab/type/proxy/string_type.h" +#include "arrow/matlab/type/proxy/wrap.h" #include "libmexclass/proxy/ProxyManager.h" @@ -48,47 +49,11 @@ namespace arrow::matlab::type::proxy { context.outputs[0] = str_mda; } - arrow::Result> makeTypeProxy(const std::shared_ptr& datatype) { - using arrow_type = arrow::Type::type; - namespace type_proxy = arrow::matlab::type::proxy; - switch (datatype->id()) { - case arrow_type::UINT8: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::UINT16: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::UINT32: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::UINT64: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::INT8: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::INT16: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::INT32: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::INT64: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::FLOAT: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::DOUBLE: - return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::BOOL: - 
return std::make_shared>(std::static_pointer_cast(datatype)); - case arrow_type::STRING: - return std::make_shared(std::static_pointer_cast(datatype)); - case arrow_type::TIMESTAMP: - return std::make_shared(std::static_pointer_cast(datatype)); - default: - return arrow::Status::NotImplemented("Unsupported DataType: " + datatype->ToString()); - } - } - - void Field::type(libmexclass::proxy::method::Context& context) { namespace mda = ::matlab::data; - auto datatype = field->type(); - MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto proxy, makeTypeProxy(datatype), context, "arrow:field:FailedToCreateTypeProxy"); + const auto& datatype = field->type(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto proxy, type::proxy::wrap(datatype), context, error::FIELD_FAILED_TO_CREATE_TYPE_PROXY); const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(proxy); mda::ArrayFactory factory; diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc new file mode 100644 index 0000000000000..b01148fe1c0a9 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.cc @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/matlab/type/proxy/wrap.h" + +#include "arrow/matlab/type/proxy/primitive_ctype.h" +#include "arrow/matlab/type/proxy/timestamp_type.h" +#include "arrow/matlab/type/proxy/string_type.h" + +namespace arrow::matlab::type::proxy { + + arrow::Result> wrap(const std::shared_ptr& type) { + using ID = arrow::Type::type; + switch (type->id()) { + case ID::BOOL: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::UINT8: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::UINT16: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::UINT32: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::UINT64: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::INT8: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::INT16: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::INT32: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::INT64: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::FLOAT: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::DOUBLE: + return std::make_shared>(std::static_pointer_cast(type)); + case ID::TIMESTAMP: + return std::make_shared(std::static_pointer_cast(type)); + case ID::STRING: + return std::make_shared(std::static_pointer_cast(type)); + default: + return arrow::Status::NotImplemented("Unsupported DataType: " + type->ToString()); + } + } +} diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h new file mode 100644 index 0000000000000..f5e2d30f8f4ec --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "arrow/type.h"
+#include "arrow/result.h"
+
+#include "arrow/matlab/type/proxy/type.h"
+
+namespace arrow::matlab::type::proxy {
+
+arrow::Result> wrap(const std::shared_ptr& type);
+
+
+}
diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index c5a7c08aa5c10..530799c15c172 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -52,7 +52,9 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/fixed_width_type.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc"
-                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc")
+                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc"
+                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc")
+
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy/factory.cc")

From 32659a2d0e939bf853129f814fab9bcdec01a3bc Mon Sep 17 00:00:00 2001
From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com>
Date: Wed, 26 Jul 2023 11:08:24 -0400
Subject: [PATCH 053/749] GH-36874: [MATLAB] Move type constructor functions
 from the `arrow.type` package to `arrow` package (#36875)

### Rationale for this change

When working on PR #36855, we realized it would be better to place all recommended public/user-facing APIs in the top-level `arrow` package. That's why we added the function `arrow.field` in the top-level `arrow` package instead of under `arrow.type`, even though the `Field` class is within the `arrow.type` package.

### What changes are included in this PR?

1. Moved the type constructor functions (`arrow.type.int8()`, `arrow.type.timestamp()`, etc.) from the `arrow.type` package to the `arrow` package.

**Example: Old Way to Create a `Timestamp` Object:**

```matlab
>> type = arrow.type.timestamp(TimeUnit="nanosecond");
```

**Example: New Way to Create a `Timestamp` Object:**

```matlab
>> type = arrow.timestamp(TimeUnit="nanosecond");
```

### Are these changes tested?

Changes are covered by existing tests.

### Are there any user-facing changes?

Yes.

NOTE: This is a breaking change, but the MATLAB interface is not yet stable.
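(Editor's note: the sketch below is not part of the commit message. Since the rename applies uniformly to all thirteen moved constructors, migrating existing code is a mechanical matter of dropping the `.type` segment; `arrow.field` itself is unchanged. Names are taken from the file renames in this patch.)

```matlab
% Hedged migration sketch: drop the ".type" segment from each constructor.
% Old (pre-#36875):
%   t = arrow.type.int64;
%   s = arrow.type.string;
t = arrow.int64;
s = arrow.string;
field = arrow.field("id", t);   % arrow.field keeps its name and signature
```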
* Closes: #36874 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- .../src/matlab/+arrow/{+type => }/boolean.m | 0 .../src/matlab/+arrow/{+type => }/float32.m | 0 .../src/matlab/+arrow/{+type => }/float64.m | 0 matlab/src/matlab/+arrow/{+type => }/int16.m | 0 matlab/src/matlab/+arrow/{+type => }/int32.m | 0 matlab/src/matlab/+arrow/{+type => }/int64.m | 0 matlab/src/matlab/+arrow/{+type => }/int8.m | 0 matlab/src/matlab/+arrow/{+type => }/string.m | 0 .../src/matlab/+arrow/{+type => }/timestamp.m | 0 matlab/src/matlab/+arrow/{+type => }/uint16.m | 0 matlab/src/matlab/+arrow/{+type => }/uint32.m | 0 matlab/src/matlab/+arrow/{+type => }/uint64.m | 0 matlab/src/matlab/+arrow/{+type => }/uint8.m | 0 matlab/test/arrow/array/tBooleanArray.m | 2 +- matlab/test/arrow/array/tFloat32Array.m | 2 +- matlab/test/arrow/array/tFloat64Array.m | 2 +- matlab/test/arrow/array/tInt16Array.m | 2 +- matlab/test/arrow/array/tInt32Array.m | 2 +- matlab/test/arrow/array/tInt64Array.m | 2 +- matlab/test/arrow/array/tInt8Array.m | 2 +- matlab/test/arrow/array/tStringArray.m | 2 +- matlab/test/arrow/array/tUInt16Array.m | 2 +- matlab/test/arrow/array/tUInt32Array.m | 2 +- matlab/test/arrow/array/tUInt64Array.m | 2 +- matlab/test/arrow/array/tUInt8Array.m | 2 +- matlab/test/arrow/type/tBooleanType.m | 2 +- matlab/test/arrow/type/tField.m | 44 +++++++++---------- matlab/test/arrow/type/tFloat32Type.m | 2 +- matlab/test/arrow/type/tFloat64Type.m | 2 +- matlab/test/arrow/type/tInt16Type.m | 2 +- matlab/test/arrow/type/tInt32Type.m | 2 +- matlab/test/arrow/type/tInt64Type.m | 2 +- matlab/test/arrow/type/tInt8Type.m | 2 +- matlab/test/arrow/type/tStringType.m | 4 +- matlab/test/arrow/type/tTimestampType.m | 24 +++++----- matlab/test/arrow/type/tUInt16Type.m | 2 +- matlab/test/arrow/type/tUInt32Type.m | 2 +- matlab/test/arrow/type/tUInt64Type.m | 2 +- matlab/test/arrow/type/tUInt8Type.m | 2 +- 39 files changed, 59 insertions(+), 59 deletions(-) rename matlab/src/matlab/+arrow/{+type => }/boolean.m (100%) rename matlab/src/matlab/+arrow/{+type => }/float32.m (100%) rename matlab/src/matlab/+arrow/{+type => }/float64.m (100%) rename matlab/src/matlab/+arrow/{+type => }/int16.m (100%) rename matlab/src/matlab/+arrow/{+type => }/int32.m (100%) rename matlab/src/matlab/+arrow/{+type => }/int64.m (100%) rename matlab/src/matlab/+arrow/{+type => }/int8.m (100%) rename matlab/src/matlab/+arrow/{+type => }/string.m (100%) rename matlab/src/matlab/+arrow/{+type => }/timestamp.m (100%) rename matlab/src/matlab/+arrow/{+type => }/uint16.m (100%) rename matlab/src/matlab/+arrow/{+type => }/uint32.m (100%) rename matlab/src/matlab/+arrow/{+type => }/uint64.m (100%) rename matlab/src/matlab/+arrow/{+type => }/uint8.m (100%) diff --git a/matlab/src/matlab/+arrow/+type/boolean.m b/matlab/src/matlab/+arrow/boolean.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/boolean.m rename to matlab/src/matlab/+arrow/boolean.m diff --git a/matlab/src/matlab/+arrow/+type/float32.m b/matlab/src/matlab/+arrow/float32.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/float32.m rename to matlab/src/matlab/+arrow/float32.m diff --git a/matlab/src/matlab/+arrow/+type/float64.m b/matlab/src/matlab/+arrow/float64.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/float64.m rename to matlab/src/matlab/+arrow/float64.m diff --git a/matlab/src/matlab/+arrow/+type/int16.m b/matlab/src/matlab/+arrow/int16.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/int16.m rename to 
matlab/src/matlab/+arrow/int16.m diff --git a/matlab/src/matlab/+arrow/+type/int32.m b/matlab/src/matlab/+arrow/int32.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/int32.m rename to matlab/src/matlab/+arrow/int32.m diff --git a/matlab/src/matlab/+arrow/+type/int64.m b/matlab/src/matlab/+arrow/int64.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/int64.m rename to matlab/src/matlab/+arrow/int64.m diff --git a/matlab/src/matlab/+arrow/+type/int8.m b/matlab/src/matlab/+arrow/int8.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/int8.m rename to matlab/src/matlab/+arrow/int8.m diff --git a/matlab/src/matlab/+arrow/+type/string.m b/matlab/src/matlab/+arrow/string.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/string.m rename to matlab/src/matlab/+arrow/string.m diff --git a/matlab/src/matlab/+arrow/+type/timestamp.m b/matlab/src/matlab/+arrow/timestamp.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/timestamp.m rename to matlab/src/matlab/+arrow/timestamp.m diff --git a/matlab/src/matlab/+arrow/+type/uint16.m b/matlab/src/matlab/+arrow/uint16.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/uint16.m rename to matlab/src/matlab/+arrow/uint16.m diff --git a/matlab/src/matlab/+arrow/+type/uint32.m b/matlab/src/matlab/+arrow/uint32.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/uint32.m rename to matlab/src/matlab/+arrow/uint32.m diff --git a/matlab/src/matlab/+arrow/+type/uint64.m b/matlab/src/matlab/+arrow/uint64.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/uint64.m rename to matlab/src/matlab/+arrow/uint64.m diff --git a/matlab/src/matlab/+arrow/+type/uint8.m b/matlab/src/matlab/+arrow/uint8.m similarity index 100% rename from matlab/src/matlab/+arrow/+type/uint8.m rename to matlab/src/matlab/+arrow/uint8.m diff --git a/matlab/test/arrow/array/tBooleanArray.m b/matlab/test/arrow/array/tBooleanArray.m index ad6126b77fe51..e27ca11285a50 100644 --- a/matlab/test/arrow/array/tBooleanArray.m +++ b/matlab/test/arrow/array/tBooleanArray.m @@ -22,7 +22,7 @@ MatlabArrayFcn = @logical MatlabConversionFcn = @logical NullSubstitutionValue = false - ArrowType = arrow.type.boolean + ArrowType = arrow.boolean end methods(TestClassSetup) diff --git a/matlab/test/arrow/array/tFloat32Array.m b/matlab/test/arrow/array/tFloat32Array.m index e8655c7781ceb..f007e2b422d6e 100644 --- a/matlab/test/arrow/array/tFloat32Array.m +++ b/matlab/test/arrow/array/tFloat32Array.m @@ -24,7 +24,7 @@ MaxValue = realmax("single") MinValue = realmin("single") NullSubstitutionValue = single(NaN) - ArrowType = arrow.type.float32 + ArrowType = arrow.float32 end methods(Test) diff --git a/matlab/test/arrow/array/tFloat64Array.m b/matlab/test/arrow/array/tFloat64Array.m index a01eef73883b6..9b30ec8f25d49 100755 --- a/matlab/test/arrow/array/tFloat64Array.m +++ b/matlab/test/arrow/array/tFloat64Array.m @@ -24,7 +24,7 @@ MaxValue = realmax("double") MinValue = realmin("double") NullSubstitutionValue = NaN - ArrowType = arrow.type.float64 + ArrowType = arrow.float64 end methods(Test) diff --git a/matlab/test/arrow/array/tInt16Array.m b/matlab/test/arrow/array/tInt16Array.m index 466dfaf9c4d7f..9cb5fdc1d1049 100644 --- a/matlab/test/arrow/array/tInt16Array.m +++ b/matlab/test/arrow/array/tInt16Array.m @@ -24,7 +24,7 @@ MaxValue = intmax("int16") MinValue = intmin("int16") NullSubstitutionValue = int16(0) - ArrowType = arrow.type.int16 + ArrowType = arrow.int16 end end diff --git 
a/matlab/test/arrow/array/tInt32Array.m b/matlab/test/arrow/array/tInt32Array.m index b8334e97ccb9a..b45705592d714 100644 --- a/matlab/test/arrow/array/tInt32Array.m +++ b/matlab/test/arrow/array/tInt32Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("int32") MinValue = intmin("int32") NullSubstitutionValue = int32(0) - ArrowType = arrow.type.int32 + ArrowType = arrow.int32 end end diff --git a/matlab/test/arrow/array/tInt64Array.m b/matlab/test/arrow/array/tInt64Array.m index a877cb2564fe9..0b38f58547cce 100644 --- a/matlab/test/arrow/array/tInt64Array.m +++ b/matlab/test/arrow/array/tInt64Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("int64") MinValue = intmin("int64") NullSubstitutionValue = int64(0) - ArrowType = arrow.type.int64 + ArrowType = arrow.int64 end end diff --git a/matlab/test/arrow/array/tInt8Array.m b/matlab/test/arrow/array/tInt8Array.m index dbd6e74ea7f8f..8ce8e4e9b2d14 100644 --- a/matlab/test/arrow/array/tInt8Array.m +++ b/matlab/test/arrow/array/tInt8Array.m @@ -24,7 +24,7 @@ MaxValue = intmax("int8") MinValue = intmin("int8") NullSubstitutionValue = int8(0) - ArrowType = arrow.type.int8 + ArrowType = arrow.int8 end end diff --git a/matlab/test/arrow/array/tStringArray.m b/matlab/test/arrow/array/tStringArray.m index 792d7599816d5..dbb2adca0ce5b 100644 --- a/matlab/test/arrow/array/tStringArray.m +++ b/matlab/test/arrow/array/tStringArray.m @@ -22,7 +22,7 @@ MatlabArrayFcn = @string MatlabConversionFcn = @string NullSubstitutionValue = string(missing) - ArrowType = arrow.type.string + ArrowType = arrow.string end methods(TestClassSetup) diff --git a/matlab/test/arrow/array/tUInt16Array.m b/matlab/test/arrow/array/tUInt16Array.m index eed53c7882b47..705d6eabc0b7b 100644 --- a/matlab/test/arrow/array/tUInt16Array.m +++ b/matlab/test/arrow/array/tUInt16Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint16") MinValue = intmin("uint16") NullSubstitutionValue = uint16(0) - ArrowType = arrow.type.uint16 + ArrowType = arrow.uint16 end end diff --git a/matlab/test/arrow/array/tUInt32Array.m b/matlab/test/arrow/array/tUInt32Array.m index b5e1970cbcc96..267a687738e44 100644 --- a/matlab/test/arrow/array/tUInt32Array.m +++ b/matlab/test/arrow/array/tUInt32Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint32") MinValue = intmin("uint32") NullSubstitutionValue = uint32(0) - ArrowType = arrow.type.uint32 + ArrowType = arrow.uint32 end end diff --git a/matlab/test/arrow/array/tUInt64Array.m b/matlab/test/arrow/array/tUInt64Array.m index 6cd2c9cba6911..b1a23a004de69 100644 --- a/matlab/test/arrow/array/tUInt64Array.m +++ b/matlab/test/arrow/array/tUInt64Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint64") MinValue = intmin("uint64") NullSubstitutionValue = uint64(0) - ArrowType = arrow.type.uint64 + ArrowType = arrow.uint64 end end diff --git a/matlab/test/arrow/array/tUInt8Array.m b/matlab/test/arrow/array/tUInt8Array.m index 68365958bc683..3db79f8c0b16d 100644 --- a/matlab/test/arrow/array/tUInt8Array.m +++ b/matlab/test/arrow/array/tUInt8Array.m @@ -24,6 +24,6 @@ MaxValue = intmax("uint8") MinValue = intmin("uint8") NullSubstitutionValue = uint8(0) - ArrowType = arrow.type.uint8 + ArrowType = arrow.uint8 end end diff --git a/matlab/test/arrow/type/tBooleanType.m b/matlab/test/arrow/type/tBooleanType.m index 94de09a3e58f1..eaa1c280d5355 100644 --- a/matlab/test/arrow/type/tBooleanType.m +++ b/matlab/test/arrow/type/tBooleanType.m @@ -17,7 +17,7 @@ % Test class for arrow.type.BooleanType properties - ArrowType = arrow.type.boolean + ArrowType = arrow.boolean TypeID = arrow.type.ID.Boolean 
BitWidth = int32(1) ClassName = "arrow.type.BooleanType" diff --git a/matlab/test/arrow/type/tField.m b/matlab/test/arrow/type/tField.m index 9f0a8851591ee..77a05bbe39513 100644 --- a/matlab/test/arrow/type/tField.m +++ b/matlab/test/arrow/type/tField.m @@ -19,7 +19,7 @@ methods(Test) function TestBasic(testCase) name = "A"; - type = arrow.type.uint64; + type = arrow.uint64; field = arrow.field(name, type); testCase.verifyEqual(field.Name, name); @@ -29,19 +29,19 @@ function TestBasic(testCase) function TestSupportedTypes(testCase) name = "name"; supportedTypes = { ... - arrow.type.uint8, ... - arrow.type.uint16, ... - arrow.type.uint32, ... - arrow.type.uint64, ... - arrow.type.int8, ... - arrow.type.int16, ... - arrow.type.int32, ... - arrow.type.int64, ... - arrow.type.boolean, ... - arrow.type.float32, ... - arrow.type.float64, ... - arrow.type.string, ... - arrow.type.timestamp, ... + arrow.uint8, ... + arrow.uint16, ... + arrow.uint32, ... + arrow.uint64, ... + arrow.int8, ... + arrow.int16, ... + arrow.int32, ... + arrow.int64, ... + arrow.boolean, ... + arrow.float32, ... + arrow.float64, ... + arrow.string, ... + arrow.timestamp, ... }; for ii = 1:numel(supportedTypes) supportedType = supportedTypes{ii}; @@ -56,7 +56,7 @@ function TestNameUnicode(testCase) tree = "🌲"; mango = "🥭"; - type = arrow.type.uint64; + type = arrow.uint64; field = arrow.field(smiley, type); testCase.verifyEqual(field.Name, smiley); @@ -75,13 +75,13 @@ function TestNameUnicode(testCase) function TestErrorIfNameStringMissing(testCase) name = string(missing); - type = arrow.type.uint64; + type = arrow.uint64; testCase.verifyError(@() arrow.field(name, type), "MATLAB:validators:mustBeNonmissing"); end function TestNameEmptyString(testCase) name = ""; - type = arrow.type.uint64; + type = arrow.uint64; field = arrow.field(name, type); testCase.verifyEqual(field.Name, name); @@ -90,7 +90,7 @@ function TestNameEmptyString(testCase) function TestNameCharVector(testCase) name = 'ABC'; - type = arrow.type.uint64; + type = arrow.uint64; field = arrow.field(name, type); testCase.verifyEqual(field.Name, string(name)); @@ -99,7 +99,7 @@ function TestNameCharVector(testCase) function TestNameNumber(testCase) name = 123; - type = arrow.type.uint64; + type = arrow.uint64; field = arrow.field(name, type); testCase.verifyEqual(field.Name, string(123)); @@ -114,17 +114,17 @@ function TestArrowTypeUnsupportedInput(testCase) function TestNameUnsupportedInput(testCase) name = table(); - type = arrow.type.uint64; + type = arrow.uint64; testCase.verifyError(@() arrow.field(name, type), "MATLAB:validation:UnableToConvert"); end function TestImmutableProperties(testCase) name = "A"; - type = arrow.type.uint64; + type = arrow.uint64; field = arrow.field(name, type); testCase.verifyError(@() setfield(field, "Name", "NewValue"), "MATLAB:class:noSetMethod") - testCase.verifyError(@() setfield(field, "Type", arrow.type.boolean), "MATLAB:class:noSetMethod") + testCase.verifyError(@() setfield(field, "Type", arrow.boolean), "MATLAB:class:noSetMethod") end end diff --git a/matlab/test/arrow/type/tFloat32Type.m b/matlab/test/arrow/type/tFloat32Type.m index c54fcfd32809b..1837c39a72ed8 100644 --- a/matlab/test/arrow/type/tFloat32Type.m +++ b/matlab/test/arrow/type/tFloat32Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.Float32Type properties - ArrowType = arrow.type.float32 + ArrowType = arrow.float32 TypeID = arrow.type.ID.Float32 BitWidth = int32(32) ClassName = "arrow.type.Float32Type" diff --git a/matlab/test/arrow/type/tFloat64Type.m 
b/matlab/test/arrow/type/tFloat64Type.m index 6b5648dfc10e3..8387a4bf5807b 100644 --- a/matlab/test/arrow/type/tFloat64Type.m +++ b/matlab/test/arrow/type/tFloat64Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.Float64Type properties - ArrowType = arrow.type.float64 + ArrowType = arrow.float64 TypeID = arrow.type.ID.Float64 BitWidth = int32(64) ClassName = "arrow.type.Float64Type" diff --git a/matlab/test/arrow/type/tInt16Type.m b/matlab/test/arrow/type/tInt16Type.m index a929ba688b5cd..9b741a32956f5 100644 --- a/matlab/test/arrow/type/tInt16Type.m +++ b/matlab/test/arrow/type/tInt16Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.Int16Type properties - ArrowType = arrow.type.int16 + ArrowType = arrow.int16 TypeID = arrow.type.ID.Int16 BitWidth = int32(16) ClassName = "arrow.type.Int16Type" diff --git a/matlab/test/arrow/type/tInt32Type.m b/matlab/test/arrow/type/tInt32Type.m index 6d59b5454e7fc..9724f9a4a6a96 100644 --- a/matlab/test/arrow/type/tInt32Type.m +++ b/matlab/test/arrow/type/tInt32Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.Int32Type properties - ArrowType = arrow.type.int32 + ArrowType = arrow.int32 TypeID = arrow.type.ID.Int32 BitWidth = int32(32) ClassName = "arrow.type.Int32Type" diff --git a/matlab/test/arrow/type/tInt64Type.m b/matlab/test/arrow/type/tInt64Type.m index 6ff0d2b07cbac..2acb5fd2d3f1f 100644 --- a/matlab/test/arrow/type/tInt64Type.m +++ b/matlab/test/arrow/type/tInt64Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.Int64Type properties - ArrowType = arrow.type.int64 + ArrowType = arrow.int64 TypeID = arrow.type.ID.Int64 BitWidth = int32(64) ClassName = "arrow.type.Int64Type" diff --git a/matlab/test/arrow/type/tInt8Type.m b/matlab/test/arrow/type/tInt8Type.m index 396be3a3f715a..15e2629bc4d68 100644 --- a/matlab/test/arrow/type/tInt8Type.m +++ b/matlab/test/arrow/type/tInt8Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.Int8Type properties - ArrowType = arrow.type.int8 + ArrowType = arrow.int8 TypeID = arrow.type.ID.Int8 BitWidth = int32(8) ClassName = "arrow.type.Int8Type" diff --git a/matlab/test/arrow/type/tStringType.m b/matlab/test/arrow/type/tStringType.m index 057ffd5426345..e52c2cb1cba0b 100644 --- a/matlab/test/arrow/type/tStringType.m +++ b/matlab/test/arrow/type/tStringType.m @@ -19,14 +19,14 @@ methods (Test) function Basic(tc) - type = arrow.type.string; + type = arrow.string; className = string(class(type)); tc.verifyEqual(className, "arrow.type.StringType"); tc.verifyEqual(type.ID, arrow.type.ID.String); end function NumFields(tc) - type = arrow.type.string; + type = arrow.string; tc.verifyEqual(type.NumFields, int32(0)); end diff --git a/matlab/test/arrow/type/tTimestampType.m b/matlab/test/arrow/type/tTimestampType.m index fa893d2d930de..8fd78854acb06 100644 --- a/matlab/test/arrow/type/tTimestampType.m +++ b/matlab/test/arrow/type/tTimestampType.m @@ -17,7 +17,7 @@ % Test class for arrow.type.TimestampType properties - ArrowType = arrow.type.timestamp + ArrowType = arrow.timestamp TypeID = arrow.type.ID.Timestamp BitWidth = int32(64) ClassName = "arrow.type.TimestampType" @@ -32,7 +32,7 @@ function TestClass(testCase) function DefaultTimeUnit(testCase) % Verify the default TimeUnit is Microsecond - type = arrow.type.timestamp; + type = arrow.timestamp; actualUnit = type.TimeUnit; expectedUnit = arrow.type.TimeUnit.Microsecond; testCase.verifyEqual(actualUnit, expectedUnit); @@ -40,7 +40,7 @@ function DefaultTimeUnit(testCase) function DefaultTimeZone(testCase) % Verify the default TimeZone is "" - type = 
arrow.type.timestamp; + type = arrow.timestamp; actualTimezone = type.TimeZone; expectedTimezone = ""; testCase.verifyEqual(actualTimezone, expectedTimezone); @@ -53,7 +53,7 @@ function SupplyTimeUnitEnum(testCase) TimeUnit.Microsecond, TimeUnit.Nanosecond]; for unit = expectedUnit - type = timestamp(TimeUnit=unit); + type = arrow.timestamp(TimeUnit=unit); testCase.verifyEqual(type.TimeUnit, unit); end end @@ -67,42 +67,42 @@ function SupplyTimeUnitString(testCase) TimeUnit.Microsecond, TimeUnit.Nanosecond]; for ii = 1:numel(unitString) - type = timestamp(TimeUnit=unitString(ii)); + type = arrow.timestamp(TimeUnit=unitString(ii)); testCase.verifyEqual(type.TimeUnit, expectedUnit(ii)); end end function SupplyTimeZone(testCase) % Supply the TimeZone. - type = arrow.type.timestamp(TimeZone="America/New_York"); + type = arrow.timestamp(TimeZone="America/New_York"); testCase.verifyEqual(type.TimeZone, "America/New_York"); end function ErrorIfMissingStringTimeZone(testCase) - fcn = @() arrow.type.timestamp(TimeZone=string(missing)); + fcn = @() arrow.timestamp(TimeZone=string(missing)); testCase.verifyError(fcn, "MATLAB:validators:mustBeNonmissing"); end function ErrorIfTimeZoneIsNonScalar(testCase) - fcn = @() arrow.type.timestamp(TimeZone=["a", "b"]); + fcn = @() arrow.timestamp(TimeZone=["a", "b"]); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); - fcn = @() arrow.type.timestamp(TimeZone=strings(0, 0)); + fcn = @() arrow.timestamp(TimeZone=strings(0, 0)); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end function ErrorIfAmbiguousTimeUnit(testCase) - fcn = @() arrow.type.timestamp(TimeUnit="mi"); + fcn = @() arrow.timestamp(TimeUnit="mi"); testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); end function ErrorIfTimeUnitIsNonScalar(testCase) units = [arrow.type.TimeUnit.Second; arrow.type.TimeUnit.Millisecond]; - fcn = @() arrow.type.timestamp(TimeZone=units); + fcn = @() arrow.timestamp(TimeZone=units); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); units = ["second" "millisecond"]; - fcn = @() arrow.type.timestamp(TimeZone=units); + fcn = @() arrow.timestamp(TimeZone=units); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end end diff --git a/matlab/test/arrow/type/tUInt16Type.m b/matlab/test/arrow/type/tUInt16Type.m index ede66f6324691..8a803dc0a7888 100644 --- a/matlab/test/arrow/type/tUInt16Type.m +++ b/matlab/test/arrow/type/tUInt16Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.UInt16Type properties - ArrowType = arrow.type.uint16 + ArrowType = arrow.uint16 TypeID = arrow.type.ID.UInt16 BitWidth = int32(16) ClassName = "arrow.type.UInt16Type" diff --git a/matlab/test/arrow/type/tUInt32Type.m b/matlab/test/arrow/type/tUInt32Type.m index def24c76ceb76..019b8ce26929d 100644 --- a/matlab/test/arrow/type/tUInt32Type.m +++ b/matlab/test/arrow/type/tUInt32Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.UInt32Type properties - ArrowType = arrow.type.uint32 + ArrowType = arrow.uint32 TypeID = arrow.type.ID.UInt32 BitWidth = int32(32) ClassName = "arrow.type.UInt32Type" diff --git a/matlab/test/arrow/type/tUInt64Type.m b/matlab/test/arrow/type/tUInt64Type.m index 9228e1cc504d6..8287bb40d0052 100644 --- a/matlab/test/arrow/type/tUInt64Type.m +++ b/matlab/test/arrow/type/tUInt64Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.UInt64Type properties - ArrowType = arrow.type.uint64 + ArrowType = arrow.uint64 TypeID = arrow.type.ID.UInt64 BitWidth = int32(64) ClassName = "arrow.type.UInt64Type" diff --git 
a/matlab/test/arrow/type/tUInt8Type.m b/matlab/test/arrow/type/tUInt8Type.m index eec3aa5fdec25..1ff203c862aeb 100644 --- a/matlab/test/arrow/type/tUInt8Type.m +++ b/matlab/test/arrow/type/tUInt8Type.m @@ -17,7 +17,7 @@ % Test class for arrow.type.UInt8Type properties - ArrowType = arrow.type.uint8 + ArrowType = arrow.uint8 TypeID = arrow.type.ID.UInt8 BitWidth = int32(8) ClassName = "arrow.type.UInt8Type" From 0513097ca311550b70ab91e7d1cbc07e3b64ba34 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Wed, 26 Jul 2023 12:16:56 -0400 Subject: [PATCH 054/749] GH-36735: Add `TimeUnit` and `TimeZone` to the `arrow.type.TimestampType` display (#36871) ### Rationale for this change The `arrow.type.TimestampType` display should include the `TimeUnit` and `TimeZone` properties. Right now we only display the `ID` property: ```matlab >> type = arrow.type.timestamp(TimeUnit="Second", TimeZone="America/Anchorage") type = TimestampType with properties: ID: Timestamp ``` We should show the other two properties in the display. ### What changes are included in this PR? Modified the display of `TimestampType`: ```matlab >> type = arrow.type.timestamp(TimeUnit="Second", TimeZone="America/Anchorage") type = TimestampType with properties: ID: Timestamp TimeUnit: Second TimeZone: "America/Anchorage" ``` Now `TimeUnit` and `TimeZone` are included. ### Are these changes tested? Added a test case to `tTimestampType` called `Display`. It verifies `TimestampType` objects are displayed correctly in the Command Window. ### Are there any user-facing changes? Yes, users will see the new display. * Closes: #36735 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- .../src/matlab/+arrow/+type/TimestampType.m | 7 +++++ matlab/test/arrow/type/tTimestampType.m | 27 +++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/matlab/src/matlab/+arrow/+type/TimestampType.m b/matlab/src/matlab/+arrow/+type/TimestampType.m index a5a376f8bc3b3..b3d34f31b7d1e 100644 --- a/matlab/src/matlab/+arrow/+type/TimestampType.m +++ b/matlab/src/matlab/+arrow/+type/TimestampType.m @@ -39,4 +39,11 @@ tz = obj.Proxy.timeZone(); end end + + methods (Access=protected) + function group = getPropertyGroups(~) + targets = ["ID" "TimeUnit" "TimeZone"]; + group = matlab.mixin.util.PropertyGroup(targets); + end + end end \ No newline at end of file diff --git a/matlab/test/arrow/type/tTimestampType.m b/matlab/test/arrow/type/tTimestampType.m index 8fd78854acb06..deee984e4b911 100644 --- a/matlab/test/arrow/type/tTimestampType.m +++ b/matlab/test/arrow/type/tTimestampType.m @@ -87,7 +87,7 @@ function ErrorIfTimeZoneIsNonScalar(testCase) fcn = @() arrow.timestamp(TimeZone=["a", "b"]); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); - fcn = @() arrow.timestamp(TimeZone=strings(0, 0)); + fcn = @() arrow.timestamp(TimeZone=strings(0, 0)); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end @@ -105,5 +105,28 @@ function ErrorIfTimeUnitIsNonScalar(testCase) fcn = @() arrow.timestamp(TimeZone=units); testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); end + + function Display(testCase) + % Verify the display of TimestampType objects.
+ % + % Example: + % + % TimestampType with properties: + % + % ID: Timestamp + % TimeUnit: Second + % TimeZone: "America/Anchorage" + % + type = arrow.timestamp(TimeUnit="Second", TimeZone="America/Anchorage"); %#ok + classnameLink = "TimestampType"; + header = " " + classnameLink + " with properties:" + newline; + body = strjust(pad(["ID:"; "TimeUnit:"; "TimeZone:"])); + body = body + " " + ["Timestamp"; "Second"; """America/Anchorage"""]; + body = " " + body; + footer = string(newline); + expectedDisplay = char(strjoin([header body' footer], newline)); + actualDisplay = evalc('disp(type)'); + testCase.verifyEqual(actualDisplay, expectedDisplay); + end end -end \ No newline at end of file +end From 7c0451c71ef68840b4642e53857b522160c075f4 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 26 Jul 2023 12:49:53 -0400 Subject: [PATCH 055/749] GH-36893: [Go][Flight] Expose underlying protobuf definitions (#36895) ### What changes are included in this PR? Shifting generated protobuf code for flight/flightsql to have the path `arrow/flight/gen/flight` rather than `arrow/flight/internal/flight` so that they are exposed and exported. * Closes: #36893 Authored-by: Matt Topol Signed-off-by: Matt Topol --- dev/release/rat_exclude_files.txt | 3 +- format/Flight.proto | 2 +- format/FlightSql.proto | 2 +- go/arrow/flight/client.go | 2 +- go/arrow/flight/flightsql/client.go | 2 +- go/arrow/flight/flightsql/client_test.go | 2 +- go/arrow/flight/flightsql/server.go | 2 +- go/arrow/flight/flightsql/server_test.go | 2 +- go/arrow/flight/flightsql/types.go | 2 +- go/arrow/flight/gen.go | 4 +- .../{internal => gen}/flight/Flight.pb.go | 71 ++- .../{internal => gen}/flight/FlightSql.pb.go | 575 ++++++++++-------- .../flight/Flight_grpc.pb.go | 46 +- go/arrow/flight/server.go | 2 +- 14 files changed, 430 insertions(+), 287 deletions(-) rename go/arrow/flight/{internal => gen}/flight/Flight.pb.go (98%) rename go/arrow/flight/{internal => gen}/flight/FlightSql.pb.go (94%) rename go/arrow/flight/{internal => gen}/flight/Flight_grpc.pb.go (93%) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f61c217760f61..7bdb692d048e9 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -63,8 +63,7 @@ docs/requirements.txt go.work.sum go/go.sum go/arrow/Gopkg.lock -go/arrow/flight/internal/flight/Flight.pb.go -go/arrow/flight/internal/flight/Flight_grpc.pb.go +go/arrow/flight/gen/flight/*.pb.go go/arrow/internal/cpu/* go/arrow/type_string.go go/arrow/cdata/test/go.sum diff --git a/format/Flight.proto b/format/Flight.proto index 107e95765406e..b5d23f6f7e6d2 100644 --- a/format/Flight.proto +++ b/format/Flight.proto @@ -20,7 +20,7 @@ syntax = "proto3"; import "google/protobuf/timestamp.proto"; option java_package = "org.apache.arrow.flight.impl"; -option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; +option go_package = "github.com/apache/arrow/go/arrow/flight/gen/flight"; option csharp_namespace = "Apache.Arrow.Flight.Protocol"; package arrow.flight.protocol; diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 48c2d94a11f42..3c9a719f1275f 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -20,7 +20,7 @@ syntax = "proto3"; import "google/protobuf/descriptor.proto"; option java_package = "org.apache.arrow.flight.sql.impl"; -option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; +option go_package = "github.com/apache/arrow/go/arrow/flight/gen/flight"; package 
arrow.flight.protocol.sql; /* diff --git a/go/arrow/flight/client.go b/go/arrow/flight/client.go index 31ffc26cfd35a..1de5fc47f5f28 100644 --- a/go/arrow/flight/client.go +++ b/go/arrow/flight/client.go @@ -26,7 +26,7 @@ import ( "strings" "sync/atomic" - "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 76c9f6fb01d32..f4cd6ee7ce56f 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/ipc" "github.com/apache/arrow/go/v13/arrow/memory" "google.golang.org/grpc" diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 2b57596fb188c..1532ef5f32f54 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" "github.com/apache/arrow/go/v13/arrow/flight/flightsql" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/suite" diff --git a/go/arrow/flight/flightsql/server.go b/go/arrow/flight/flightsql/server.go index ee457ad7a8bca..48c0314fa6490 100644 --- a/go/arrow/flight/flightsql/server.go +++ b/go/arrow/flight/flightsql/server.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" "github.com/apache/arrow/go/v13/arrow/flight/flightsql/schema_ref" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/ipc" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/flight/flightsql/server_test.go b/go/arrow/flight/flightsql/server_test.go index 9ced8e0ed6cdf..43a23bb7e9ac6 100644 --- a/go/arrow/flight/flightsql/server_test.go +++ b/go/arrow/flight/flightsql/server_test.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/flight" "github.com/apache/arrow/go/v13/arrow/flight/flightsql" - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/stretchr/testify/suite" "google.golang.org/grpc" diff --git a/go/arrow/flight/flightsql/types.go b/go/arrow/flight/flightsql/types.go index 72de81115a77a..34db36b44df2a 100644 --- a/go/arrow/flight/flightsql/types.go +++ b/go/arrow/flight/flightsql/types.go @@ -17,7 +17,7 @@ package flightsql import ( - pb "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" ) diff --git a/go/arrow/flight/gen.go 
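The practical effect of the rename in the import changes above is Go's visibility rule: a package under an `internal/` directory can only be imported from within its own module, while `gen/` carries no such restriction, so third-party modules can now depend on the generated Flight protobuf types directly. A minimal sketch of what this enables, assuming the `v13` module path shown in the diffs above; the descriptor contents are illustrative and not taken from the patch:

```go
package main

import (
	"fmt"

	// This import is only legal outside the Arrow module now that the
	// generated code lives under gen/ rather than internal/.
	pb "github.com/apache/arrow/go/v13/arrow/flight/gen/flight"
)

func main() {
	// Build a raw Flight descriptor the way a client would before calling
	// GetFlightInfo. FlightDescriptor and FlightDescriptor_CMD are the
	// generated names visible in the Flight.pb.go diff below.
	desc := &pb.FlightDescriptor{
		Type: pb.FlightDescriptor_CMD,
		Cmd:  []byte("SELECT 1"), // hypothetical command payload
	}
	fmt.Println(desc.Type, string(desc.Cmd))
}
```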
b/go/arrow/flight/gen.go index 4109059af8ed1..cfdd0e036703a 100644 --- a/go/arrow/flight/gen.go +++ b/go/arrow/flight/gen.go @@ -16,5 +16,5 @@ package flight -//go:generate protoc -I../../../format --go_out=./internal/flight --go-grpc_out=./internal/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative Flight.proto -//go:generate protoc --experimental_allow_proto3_optional -I../../../format --go_out=./internal/flight --go-grpc_out=./internal/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative FlightSql.proto +//go:generate protoc -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative Flight.proto +//go:generate protoc --experimental_allow_proto3_optional -I../../../format --go_out=./gen/flight --go-grpc_out=./gen/flight --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative FlightSql.proto diff --git a/go/arrow/flight/internal/flight/Flight.pb.go b/go/arrow/flight/gen/flight/Flight.pb.go similarity index 98% rename from go/arrow/flight/internal/flight/Flight.pb.go rename to go/arrow/flight/gen/flight/Flight.pb.go index 7b4d1e2fd9298..f91d762014603 100644 --- a/go/arrow/flight/internal/flight/Flight.pb.go +++ b/go/arrow/flight/gen/flight/Flight.pb.go @@ -18,15 +18,15 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.28.1 -// protoc v3.21.12 +// protoc v3.12.4 // source: Flight.proto package flight import ( + timestamp "github.com/golang/protobuf/ptypes/timestamp" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" - timestamppb "google.golang.org/protobuf/types/known/timestamppb" reflect "reflect" sync "sync" ) @@ -38,6 +38,7 @@ const ( _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) +// // The result of a cancel operation. // // This is used by CancelFlightInfoResult.status. @@ -102,17 +103,19 @@ func (CancelStatus) EnumDescriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{0} } +// // Describes what type of descriptor is defined. type FlightDescriptor_DescriptorType int32 const ( // Protobuf pattern, not used. FlightDescriptor_UNKNOWN FlightDescriptor_DescriptorType = 0 + // // A named path that identifies a dataset. A path is composed of a string // or list of strings describing a particular dataset. This is conceptually - // - // similar to a path inside a filesystem. + // similar to a path inside a filesystem. FlightDescriptor_PATH FlightDescriptor_DescriptorType = 1 + // // An opaque command to generate a dataset. FlightDescriptor_CMD FlightDescriptor_DescriptorType = 2 ) @@ -158,14 +161,17 @@ func (FlightDescriptor_DescriptorType) EnumDescriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{12, 0} } +// // The request that a client provides to a server on handshake. type HandshakeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // A defined protocol version ProtocolVersion uint64 `protobuf:"varint,1,opt,name=protocol_version,json=protocolVersion,proto3" json:"protocol_version,omitempty"` + // // Arbitrary auth/handshake info. 
Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` } @@ -221,8 +227,10 @@ type HandshakeResponse struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // A defined protocol version ProtocolVersion uint64 `protobuf:"varint,1,opt,name=protocol_version,json=protocolVersion,proto3" json:"protocol_version,omitempty"` + // // Arbitrary auth/handshake info. Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` } @@ -273,6 +281,7 @@ func (x *HandshakeResponse) GetPayload() []byte { return nil } +// // A message for doing simple auth. type BasicAuth struct { state protoimpl.MessageState @@ -367,6 +376,7 @@ func (*Empty) Descriptor() ([]byte, []int) { return file_Flight_proto_rawDescGZIP(), []int{3} } +// // Describes an available action, including both the name used for execution // along with a short description of the purpose of the action. type ActionType struct { @@ -424,6 +434,7 @@ func (x *ActionType) GetDescription() string { return "" } +// // A service specific expression that can be used to return a limited set // of available Arrow Flight streams. type Criteria struct { @@ -473,6 +484,7 @@ func (x *Criteria) GetExpression() []byte { return nil } +// // An opaque action specific for the service. type Action struct { state protoimpl.MessageState @@ -529,6 +541,7 @@ func (x *Action) GetBody() []byte { return nil } +// // The request of the CancelFlightInfo action. // // The request should be stored in Action.body. @@ -579,6 +592,7 @@ func (x *CancelFlightInfoRequest) GetInfo() *FlightInfo { return nil } +// // The request of the RenewFlightEndpoint action. // // The request should be stored in Action.body. @@ -629,6 +643,7 @@ func (x *RenewFlightEndpointRequest) GetEndpoint() *FlightEndpoint { return nil } +// // An opaque result returned after executing an action. type Result struct { state protoimpl.MessageState @@ -677,6 +692,7 @@ func (x *Result) GetBody() []byte { return nil } +// // The result of the CancelFlightInfo action. // // The result should be stored in Result.body. @@ -727,6 +743,7 @@ func (x *CancelFlightInfoResult) GetStatus() CancelStatus { return CancelStatus_CANCEL_STATUS_UNSPECIFIED } +// // Wrap the result of a getSchema call type SchemaResult struct { state protoimpl.MessageState @@ -734,10 +751,9 @@ type SchemaResult struct { unknownFields protoimpl.UnknownFields // The schema of the dataset in its IPC form: - // - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema Schema []byte `protobuf:"bytes,1,opt,name=schema,proto3" json:"schema,omitempty"` } @@ -780,6 +796,7 @@ func (x *SchemaResult) GetSchema() []byte { return nil } +// // The name or tag for a Flight. May be used as a way to retrieve or generate // a flight or be used to expose a set of previously defined flights. type FlightDescriptor struct { @@ -788,9 +805,11 @@ type FlightDescriptor struct { unknownFields protoimpl.UnknownFields Type FlightDescriptor_DescriptorType `protobuf:"varint,1,opt,name=type,proto3,enum=arrow.flight.protocol.FlightDescriptor_DescriptorType" json:"type,omitempty"` + // // Opaque value used to express a command. Should only be defined when // type = CMD. 
Cmd []byte `protobuf:"bytes,2,opt,name=cmd,proto3" json:"cmd,omitempty"` + // // List of strings identifying a particular dataset. Should only be defined // when type = PATH. Path []string `protobuf:"bytes,3,rep,name=path,proto3" json:"path,omitempty"` @@ -849,6 +868,7 @@ func (x *FlightDescriptor) GetPath() []string { return nil } +// // The access coordinates for retrieval of a dataset. With a FlightInfo, a // consumer is able to determine how to retrieve a dataset. type FlightInfo struct { @@ -857,13 +877,14 @@ type FlightInfo struct { unknownFields protoimpl.UnknownFields // The schema of the dataset in its IPC form: - // - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema Schema []byte `protobuf:"bytes,1,opt,name=schema,proto3" json:"schema,omitempty"` + // // The descriptor associated with this info. FlightDescriptor *FlightDescriptor `protobuf:"bytes,2,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // A list of endpoints associated with the flight. To consume the // whole flight, all endpoints (and hence all Tickets) must be // consumed. Endpoints can be consumed in any order. @@ -883,13 +904,14 @@ type FlightInfo struct { // ordering is important for an application, an application must // choose one of them: // - // - An application requires that all clients must read data in - // returned endpoints order. - // - An application must return the all data in a single endpoint. + // * An application requires that all clients must read data in + // returned endpoints order. + // * An application must return the all data in a single endpoint. Endpoint []*FlightEndpoint `protobuf:"bytes,3,rep,name=endpoint,proto3" json:"endpoint,omitempty"` // Set these to -1 if unknown. TotalRecords int64 `protobuf:"varint,4,opt,name=total_records,json=totalRecords,proto3" json:"total_records,omitempty"` TotalBytes int64 `protobuf:"varint,5,opt,name=total_bytes,json=totalBytes,proto3" json:"total_bytes,omitempty"` + // // FlightEndpoints are in the same order as the data. Ordered bool `protobuf:"varint,6,opt,name=ordered,proto3" json:"ordered,omitempty"` } @@ -968,14 +990,17 @@ func (x *FlightInfo) GetOrdered() bool { return false } +// // A particular stream or split associated with a flight. type FlightEndpoint struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Token used to retrieve this stream. Ticket *Ticket `protobuf:"bytes,1,opt,name=ticket,proto3" json:"ticket,omitempty"` + // // A list of URIs where this ticket can be redeemed via DoGet(). // // If the list is empty, the expectation is that the ticket can only @@ -991,10 +1016,11 @@ type FlightEndpoint struct { // In other words, an application can use multiple locations to // represent redundant and/or load balanced services. Location []*Location `protobuf:"bytes,2,rep,name=location,proto3" json:"location,omitempty"` + // // Expiration time of this stream. If present, clients may assume // they can retry DoGet requests. Otherwise, it is // application-defined whether DoGet requests may be retried. 
- ExpirationTime *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=expiration_time,json=expirationTime,proto3" json:"expiration_time,omitempty"` + ExpirationTime *timestamp.Timestamp `protobuf:"bytes,3,opt,name=expiration_time,json=expirationTime,proto3" json:"expiration_time,omitempty"` } func (x *FlightEndpoint) Reset() { @@ -1043,13 +1069,14 @@ func (x *FlightEndpoint) GetLocation() []*Location { return nil } -func (x *FlightEndpoint) GetExpirationTime() *timestamppb.Timestamp { +func (x *FlightEndpoint) GetExpirationTime() *timestamp.Timestamp { if x != nil { return x.ExpirationTime } return nil } +// // A location where a Flight service will accept retrieval of a particular // stream given a ticket. type Location struct { @@ -1099,6 +1126,7 @@ func (x *Location) GetUri() string { return "" } +// // An opaque identifier that the service can use to retrieve a particular // portion of a stream. // @@ -1151,19 +1179,24 @@ func (x *Ticket) GetTicket() []byte { return nil } +// // A batch of Arrow data as part of a stream of batches. type FlightData struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // The descriptor of the data. This is only relevant when a client is // starting a new DoPut stream. FlightDescriptor *FlightDescriptor `protobuf:"bytes,1,opt,name=flight_descriptor,json=flightDescriptor,proto3" json:"flight_descriptor,omitempty"` + // // Header for message data as described in Message.fbs::Message. DataHeader []byte `protobuf:"bytes,2,opt,name=data_header,json=dataHeader,proto3" json:"data_header,omitempty"` + // // Application-defined metadata. AppMetadata []byte `protobuf:"bytes,3,opt,name=app_metadata,json=appMetadata,proto3" json:"app_metadata,omitempty"` + // // The actual batch of Arrow data. Preferably handled with minimal-copies // coming last in the definition to help with sidecar patterns (it is // expected that some implementations will fetch this field off the wire @@ -1231,7 +1264,7 @@ func (x *FlightData) GetDataBody() []byte { return nil } -// * +//* // The response message associated with the submission of a DoPut. type PutResult struct { state protoimpl.MessageState @@ -1502,7 +1535,7 @@ var file_Flight_proto_goTypes = []interface{}{ (*Ticket)(nil), // 18: arrow.flight.protocol.Ticket (*FlightData)(nil), // 19: arrow.flight.protocol.FlightData (*PutResult)(nil), // 20: arrow.flight.protocol.PutResult - (*timestamppb.Timestamp)(nil), // 21: google.protobuf.Timestamp + (*timestamp.Timestamp)(nil), // 21: google.protobuf.Timestamp } var file_Flight_proto_depIdxs = []int32{ 15, // 0: arrow.flight.protocol.CancelFlightInfoRequest.info:type_name -> arrow.flight.protocol.FlightInfo diff --git a/go/arrow/flight/internal/flight/FlightSql.pb.go b/go/arrow/flight/gen/flight/FlightSql.pb.go similarity index 94% rename from go/arrow/flight/internal/flight/FlightSql.pb.go rename to go/arrow/flight/gen/flight/FlightSql.pb.go index b61ac29066836..3b1ba232d3d12 100644 --- a/go/arrow/flight/internal/flight/FlightSql.pb.go +++ b/go/arrow/flight/gen/flight/FlightSql.pb.go @@ -18,15 +18,15 @@ // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.28.1 -// protoc v3.21.12 +// protoc v3.12.4 // source: FlightSql.proto package flight import ( + descriptor "github.com/golang/protobuf/protoc-gen-go/descriptor" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" - descriptorpb "google.golang.org/protobuf/types/descriptorpb" reflect "reflect" sync "sync" ) @@ -48,27 +48,33 @@ const ( SqlInfo_FLIGHT_SQL_SERVER_VERSION SqlInfo = 1 // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. SqlInfo_FLIGHT_SQL_SERVER_ARROW_VERSION SqlInfo = 2 + // // Retrieves a boolean value indicating whether the Flight SQL Server is read only. // // Returns: // - false: if read-write // - true: if read only SqlInfo_FLIGHT_SQL_SERVER_READ_ONLY SqlInfo = 3 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports executing // SQL queries. // // Note that the absence of this info (as opposed to a false value) does not necessarily // mean that SQL is not supported, as this property was not originally defined. SqlInfo_FLIGHT_SQL_SERVER_SQL SqlInfo = 4 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports executing // Substrait plans. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT SqlInfo = 5 + // // Retrieves a string value indicating the minimum supported Substrait version, or null // if Substrait is not supported. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION SqlInfo = 6 + // // Retrieves a string value indicating the maximum supported Substrait version, or null // if Substrait is not supported. SqlInfo_FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION SqlInfo = 7 + // // Retrieves an int32 indicating whether the Flight SQL Server supports the // BeginTransaction/EndTransaction/BeginSavepoint/EndSavepoint actions. // @@ -78,51 +84,61 @@ const ( // // The possible values are listed in `SqlSupportedTransaction`. SqlInfo_FLIGHT_SQL_SERVER_TRANSACTION SqlInfo = 8 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports explicit // query cancellation (the CancelQuery action). SqlInfo_FLIGHT_SQL_SERVER_CANCEL SqlInfo = 9 + // // Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. // // If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. SqlInfo_FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT SqlInfo = 100 + // // Retrieves an int32 indicating the timeout (in milliseconds) for transactions, since transactions are not tied to a connection. // // If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. SqlInfo_FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT SqlInfo = 101 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. // // Returns: // - false: if it doesn't support CREATE and DROP of catalogs. // - true: if it supports CREATE and DROP of catalogs. SqlInfo_SQL_DDL_CATALOG SqlInfo = 500 + // // Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. // // Returns: // - false: if it doesn't support CREATE and DROP of schemas. // - true: if it supports CREATE and DROP of schemas. SqlInfo_SQL_DDL_SCHEMA SqlInfo = 501 + // // Indicates whether the Flight SQL Server supports CREATE and DROP of tables. // // Returns: // - false: if it doesn't support CREATE and DROP of tables. // - true: if it supports CREATE and DROP of tables. 
SqlInfo_SQL_DDL_TABLE SqlInfo = 502 + // // Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. // // The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlInfo_SQL_IDENTIFIER_CASE SqlInfo = 503 // Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. SqlInfo_SQL_IDENTIFIER_QUOTE_CHAR SqlInfo = 504 + // // Retrieves a int32 describing the case sensitivity of quoted identifiers. // // The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlInfo_SQL_QUOTED_IDENTIFIER_CASE SqlInfo = 505 + // // Retrieves a boolean value indicating whether all tables are selectable. // // Returns: // - false: if not all tables are selectable or if none are; // - true: if all tables are selectable. SqlInfo_SQL_ALL_TABLES_ARE_SELECTABLE SqlInfo = 506 + // // Retrieves the null ordering. // // Returns a int32 ordinal for the null ordering being used, as described in // `arrow.flight.protocol.sql.SqlNullOrdering`. SqlInfo_SQL_NULL_ORDERING SqlInfo = 507 // Retrieves a UTF-8 string list with values of the supported keywords. SqlInfo_SQL_KEYWORDS SqlInfo = 508 // Retrieves a UTF-8 string list with values of the supported numeric functions. SqlInfo_SQL_NUMERIC_FUNCTIONS SqlInfo = 509 // Retrieves a UTF-8 string list with values of the supported string functions. SqlInfo_SQL_STRING_FUNCTIONS SqlInfo = 510 // Retrieves a UTF-8 string list with values of the supported system functions. SqlInfo_SQL_SYSTEM_FUNCTIONS SqlInfo = 511 // Retrieves a UTF-8 string list with values of the supported datetime functions. SqlInfo_SQL_DATETIME_FUNCTIONS SqlInfo = 512 + // // Retrieves the UTF-8 string that can be used to escape wildcard characters. // This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern // (and therefore use one of the wildcard characters). // The '_' character represents any single character; the '%' character represents any sequence of zero or more // characters. SqlInfo_SQL_SEARCH_STRING_ESCAPE SqlInfo = 513 + // // Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names // (those beyond a-z, A-Z, 0-9 and _). SqlInfo_SQL_EXTRA_NAME_CHARACTERS SqlInfo = 514 + // // Retrieves a boolean value indicating whether column aliasing is supported. // If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns // as required. // // Returns: // - false: if column aliasing is unsupported; // - true: if column aliasing is supported. SqlInfo_SQL_SUPPORTS_COLUMN_ALIASING SqlInfo = 515 + // // Retrieves a boolean value indicating whether concatenations between null and non-null values being // null are supported. // // Returns: // - false: if concatenations between null and non-null values being null are unsupported; // - true: if concatenations between null and non-null values being null are supported. SqlInfo_SQL_NULL_PLUS_NULL_IS_NULL SqlInfo = 516 + // // Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, // indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on // SqlSupportsConvert enum. // The returned map will be: map<int32, list<int32>> SqlInfo_SQL_SUPPORTS_CONVERT SqlInfo = 517 + // // Retrieves a boolean value indicating whether, when table correlation names are supported, // they are restricted to being different from the names of the tables. // // Returns: // - false: if table correlation names are unsupported; // - true: if table correlation names are supported. SqlInfo_SQL_SUPPORTS_TABLE_CORRELATION_NAMES SqlInfo = 518 + // // Retrieves a boolean value indicating whether, when table correlation names are supported, // they are restricted to being different from the names of the tables.
// @@ -181,12 +204,14 @@ const ( // - false: if different table correlation names are unsupported; // - true: if different table correlation names are supported SqlInfo_SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES SqlInfo = 519 + // // Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. // // Returns: // - false: if expressions in ORDER BY are unsupported; // - true: if expressions in ORDER BY are supported; SqlInfo_SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY SqlInfo = 520 + // // Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY // clause is supported. // @@ -194,6 +219,7 @@ const ( // - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; // - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. SqlInfo_SQL_SUPPORTS_ORDER_BY_UNRELATED SqlInfo = 521 + // // Retrieves the supported GROUP BY commands; // // Returns an int32 bitmask value representing the supported commands. @@ -206,18 +232,21 @@ const ( // - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. // Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlInfo_SQL_SUPPORTED_GROUP_BY SqlInfo = 522 + // // Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. // // Returns: // - false: if specifying a LIKE escape clause is unsupported; // - true: if specifying a LIKE escape clause is supported. SqlInfo_SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE SqlInfo = 523 + // // Retrieves a boolean value indicating whether columns may be defined as non-nullable. // // Returns: // - false: if columns cannot be defined as non-nullable; // - true: if columns may be defined as non-nullable. SqlInfo_SQL_SUPPORTS_NON_NULLABLE_COLUMNS SqlInfo = 524 + // // Retrieves the supported SQL grammar level as per the ODBC specification. // // Returns an int32 bitmask value representing the supported SQL grammar level. @@ -234,6 +263,7 @@ const ( // - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. // Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlInfo_SQL_SUPPORTED_GRAMMAR SqlInfo = 525 + // // Retrieves the supported ANSI92 SQL grammar level. // // Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. @@ -250,12 +280,14 @@ const ( // - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. // Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlInfo_SQL_ANSI92_SUPPORTED_LEVEL SqlInfo = 526 + // // Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. // // Returns: // - false: if the SQL Integrity Enhancement Facility is supported; // - true: if the SQL Integrity Enhancement Facility is supported. SqlInfo_SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY SqlInfo = 527 + // // Retrieves the support level for SQL OUTER JOINs. // // Returns a int32 ordinal for the SQL ordering being used, as described in @@ -265,14 +297,17 @@ const ( SqlInfo_SQL_SCHEMA_TERM SqlInfo = 529 // Retrieves a UTF-8 string with the preferred term for "procedure". SqlInfo_SQL_PROCEDURE_TERM SqlInfo = 530 + // // Retrieves a UTF-8 string with the preferred term for "catalog". // If a empty string is returned its assumed that the server does NOT supports catalogs. 
SqlInfo_SQL_CATALOG_TERM SqlInfo = 531 + // // Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. // // - false: if a catalog does not appear at the start of a fully qualified table name; // - true: if a catalog appears at the start of a fully qualified table name. SqlInfo_SQL_CATALOG_AT_START SqlInfo = 532 + // // Retrieves the supported actions for a SQL schema. // // Returns an int32 bitmask value representing the supported actions for a SQL schema. @@ -289,6 +324,7 @@ const ( // - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. // Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlInfo_SQL_SCHEMAS_SUPPORTED_ACTIONS SqlInfo = 533 + // // Retrieves the supported actions for a SQL schema. // // Returns an int32 bitmask value representing the supported actions for a SQL catalog. @@ -305,6 +341,7 @@ const ( // - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. // Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlInfo_SQL_CATALOGS_SUPPORTED_ACTIONS SqlInfo = 534 + // // Retrieves the supported SQL positioned commands. // // Returns an int32 bitmask value representing the supported SQL positioned commands. @@ -317,12 +354,14 @@ const ( // - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. // Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. SqlInfo_SQL_SUPPORTED_POSITIONED_COMMANDS SqlInfo = 535 + // // Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. // // Returns: // - false: if SELECT FOR UPDATE statements are unsupported; // - true: if SELECT FOR UPDATE statements are supported. SqlInfo_SQL_SELECT_FOR_UPDATE_SUPPORTED SqlInfo = 536 + // // Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax // are supported. // @@ -330,6 +369,7 @@ const ( // - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; // - true: if stored procedure calls that use the stored procedure escape syntax are supported. SqlInfo_SQL_STORED_PROCEDURES_SUPPORTED SqlInfo = 537 + // // Retrieves the supported SQL subqueries. // // Returns an int32 bitmask value representing the supported SQL subqueries. @@ -355,12 +395,14 @@ const ( // - ... // Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlInfo_SQL_SUPPORTED_SUBQUERIES SqlInfo = 538 + // // Retrieves a boolean value indicating whether correlated subqueries are supported. // // Returns: // - false: if correlated subqueries are unsupported; // - true: if correlated subqueries are supported. SqlInfo_SQL_CORRELATED_SUBQUERIES_SUPPORTED SqlInfo = 539 + // // Retrieves the supported SQL UNIONs. // // Returns an int32 bitmask value representing the supported SQL UNIONs. @@ -393,6 +435,7 @@ const ( SqlInfo_SQL_MAX_CONNECTIONS SqlInfo = 549 // Retrieves a int64 value the maximum number of characters allowed in a cursor name. SqlInfo_SQL_MAX_CURSOR_NAME_LENGTH SqlInfo = 550 + // // Retrieves a int64 value representing the maximum number of bytes allowed for an index, // including all of the parts of the index. 
SqlInfo_SQL_MAX_INDEX_LENGTH SqlInfo = 551 @@ -404,15 +447,17 @@ const ( SqlInfo_SQL_MAX_CATALOG_NAME_LENGTH SqlInfo = 554 // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. SqlInfo_SQL_MAX_ROW_SIZE SqlInfo = 555 + // // Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL // data types LONGVARCHAR and LONGVARBINARY. // // Returns: - // - false: if return value for the JDBC method getMaxRowSize does - // not include the SQL data types LONGVARCHAR and LONGVARBINARY; - // - true: if return value for the JDBC method getMaxRowSize includes - // the SQL data types LONGVARCHAR and LONGVARBINARY. + // - false: if return value for the JDBC method getMaxRowSize does + // not include the SQL data types LONGVARCHAR and LONGVARBINARY; + // - true: if return value for the JDBC method getMaxRowSize includes + // the SQL data types LONGVARCHAR and LONGVARBINARY. SqlInfo_SQL_MAX_ROW_SIZE_INCLUDES_BLOBS SqlInfo = 556 + // // Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; // a result of 0 (zero) means that there is no limit or the limit is not known. SqlInfo_SQL_MAX_STATEMENT_LENGTH SqlInfo = 557 @@ -424,11 +469,13 @@ const ( SqlInfo_SQL_MAX_TABLES_IN_SELECT SqlInfo = 560 // Retrieves a int64 value representing the maximum number of characters allowed in a user name. SqlInfo_SQL_MAX_USERNAME_LENGTH SqlInfo = 561 + // // Retrieves this database's default transaction isolation level as described in // `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. // // Returns a int32 ordinal for the SQL transaction isolation level. SqlInfo_SQL_DEFAULT_TRANSACTION_ISOLATION SqlInfo = 562 + // // Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a // noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. // @@ -436,6 +483,7 @@ const ( // - false: if transactions are unsupported; // - true: if transactions are supported. SqlInfo_SQL_TRANSACTIONS_SUPPORTED SqlInfo = 563 + // // Retrieves the supported transactions isolation levels. // // Returns an int32 bitmask value representing the supported transactions isolation levels. @@ -462,6 +510,7 @@ const ( // - ... // Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. SqlInfo_SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS SqlInfo = 564 + // // Retrieves a boolean value indicating whether a data definition statement within a transaction forces // the transaction to commit. // @@ -469,12 +518,14 @@ const ( // - false: if a data definition statement within a transaction does not force the transaction to commit; // - true: if a data definition statement within a transaction forces the transaction to commit. SqlInfo_SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT SqlInfo = 565 + // // Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. // // Returns: // - false: if a data definition statement within a transaction is taken into account; // - true: a data definition statement within a transaction is ignored. SqlInfo_SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED SqlInfo = 566 + // // Retrieves an int32 bitmask value representing the supported result set types. // The returned bitmask should be parsed in order to retrieve the supported result set types. // @@ -491,6 +542,7 @@ const ( // - ... 
// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. SqlInfo_SQL_SUPPORTED_RESULT_SET_TYPES SqlInfo = 567 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. // @@ -505,6 +557,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED SqlInfo = 568 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. // @@ -519,6 +572,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY SqlInfo = 569 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. // @@ -533,6 +587,7 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE SqlInfo = 570 + // // Returns an int32 bitmask value concurrency types supported for // `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. // @@ -547,29 +602,34 @@ const ( // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE SqlInfo = 571 + // // Retrieves a boolean value indicating whether this database supports batch updates. // // - false: if this database does not support batch updates; // - true: if this database supports batch updates. SqlInfo_SQL_BATCH_UPDATES_SUPPORTED SqlInfo = 572 + // // Retrieves a boolean value indicating whether this database supports savepoints. // // Returns: // - false: if this database does not support savepoints; // - true: if this database supports savepoints. SqlInfo_SQL_SAVEPOINTS_SUPPORTED SqlInfo = 573 + // // Retrieves a boolean value indicating whether named parameters are supported in callable statements. // // Returns: // - false: if named parameters in callable statements are unsupported; // - true: if named parameters in callable statements are supported. SqlInfo_SQL_NAMED_PARAMETERS_SUPPORTED SqlInfo = 574 + // // Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. // // Returns: // - false: if updates made to a LOB are made directly to the LOB; // - true: if updates made to a LOB are made on a copy. SqlInfo_SQL_LOCATORS_UPDATE_COPY SqlInfo = 575 + // // Retrieves a boolean value indicating whether invoking user-defined or vendor functions // using the stored procedure escape syntax is supported. 
// @@ -1642,7 +1702,7 @@ func (SqlSupportsConvert) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{16} } -// * +//* // The JDBC/ODBC-defined type of any object. // All the values here are the sames as in the JDBC and ODBC specs. type XdbcDataType int32 @@ -1757,7 +1817,7 @@ func (XdbcDataType) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{17} } -// * +//* // Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. type XdbcDatetimeSubcode int32 @@ -1898,13 +1958,13 @@ func (XdbcDatetimeSubcode) EnumDescriptor() ([]byte, []int) { type Nullable int32 const ( - // * + //* // Indicates that the fields does not allow the use of null values. Nullable_NULLABILITY_NO_NULLS Nullable = 0 - // * + //* // Indicates that the fields allow the use of null values. Nullable_NULLABILITY_NULLABLE Nullable = 1 - // * + //* // Indicates that nullability of the fields can not be determined. Nullable_NULLABILITY_UNKNOWN Nullable = 2 ) @@ -1953,21 +2013,21 @@ func (Nullable) EnumDescriptor() ([]byte, []int) { type Searchable int32 const ( - // * + //* // Indicates that column can not be used in a WHERE clause. Searchable_SEARCHABLE_NONE Searchable = 0 - // * + //* // Indicates that the column can be used in a WHERE clause if it is using a // LIKE operator. Searchable_SEARCHABLE_CHAR Searchable = 1 - // * + //* // Indicates that the column can be used In a WHERE clause with any // operator other than LIKE. // - // - Allowed operators: comparison, quantified comparison, BETWEEN, - // DISTINCT, IN, MATCH, and UNIQUE. + // - Allowed operators: comparison, quantified comparison, BETWEEN, + // DISTINCT, IN, MATCH, and UNIQUE. Searchable_SEARCHABLE_BASIC Searchable = 2 - // * + //* // Indicates that the column can be used in a WHERE clause using any operator. Searchable_SEARCHABLE_FULL Searchable = 3 ) @@ -2233,23 +2293,22 @@ func (ActionCancelQueryResult_CancelResult) EnumDescriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{29, 0} } +// // Represents a metadata request. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the metadata request. // // The returned Arrow schema will be: // < -// -// info_name: uint32 not null, -// value: dense_union< -// string_value: utf8, -// bool_value: bool, -// bigint_value: int64, -// int32_bitmask: int32, -// string_list: list<utf8> -// int32_to_int32_list_map: map<int32, list<int32>> -// +// info_name: uint32 not null, +// value: dense_union< +// string_value: utf8, +// bool_value: bool, +// bigint_value: int64, +// int32_bitmask: int32, +// string_list: list<utf8> +// int32_to_int32_list_map: map<int32, list<int32>> // > // where there is one row per requested piece of metadata information. type CommandGetSqlInfo struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide // Flight SQL clients with basic, SQL syntax and SQL functions related information. // More information types can be added in future releases. @@ -2316,62 +2376,61 @@ func (x *CommandGetSqlInfo) GetInfo() []uint32 { return nil } +//
// Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned schema will be: // < -// -// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), -// data_type: int32 not null (The SQL data type), -// column_size: int32 (The maximum size supported by that column. -// In case of exact numeric types, this represents the maximum precision. -// In case of string types, this represents the character length. -// In case of datetime data types, this represents the length in characters of the string representation. -// NULL is returned for data types where column size is not applicable.), -// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for -// data types where a literal prefix is not applicable.), -// literal_suffix: utf8 (Character or characters used to terminate a literal, -// NULL is returned for data types where a literal suffix is not applicable.), -// create_params: list -// (A list of keywords corresponding to which parameters can be used when creating -// a column for that specific type. -// NULL is returned if there are no parameters for the data type definition.), -// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the -// Nullable enum.), -// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), -// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the -// Searchable enum.), -// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is -// not applicable to the data type or the data type is not numeric.), -// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), -// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute -// is not applicable to the data type or the data type is not numeric.), -// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL -// is returned if a localized name is not supported by the data source), -// minimum_scale: int32 (The minimum scale of the data type on the data source. -// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE -// columns both contain this value. NULL is returned if scale is not applicable.), -// maximum_scale: int32 (The maximum scale of the data type on the data source. -// NULL is returned if scale is not applicable.), -// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values -// as data_type value. Except for interval and datetime, which -// uses generic values. More info about those types can be -// obtained through datetime_subcode. The possible values can be seen -// in the XdbcDataType enum.), -// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains -// its sub types. For type different from interval and datetime, this value -// is NULL. 
The possible values can be seen in the XdbcDatetimeSubcode enum.), -// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains -// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For -// exact numeric types, this column contains the value 10 to indicate that -// column size specifies a number of decimal digits. Otherwise, this column is NULL.), -// interval_precision: int32 (If the data type is an interval data type, then this column contains the value -// of the interval leading precision. Otherwise, this column is NULL. This fields -// is only relevant to be used by ODBC). -// +// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), +// data_type: int32 not null (The SQL data type), +// column_size: int32 (The maximum size supported by that column. +// In case of exact numeric types, this represents the maximum precision. +// In case of string types, this represents the character length. +// In case of datetime data types, this represents the length in characters of the string representation. +// NULL is returned for data types where column size is not applicable.), +// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for +// data types where a literal prefix is not applicable.), +// literal_suffix: utf8 (Character or characters used to terminate a literal, +// NULL is returned for data types where a literal suffix is not applicable.), +// create_params: list<utf8> +// (A list of keywords corresponding to which parameters can be used when creating +// a column for that specific type. +// NULL is returned if there are no parameters for the data type definition.), +// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the +// Nullable enum.), +// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), +// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the +// Searchable enum.), +// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is +// not applicable to the data type or the data type is not numeric.), +// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), +// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute +// is not applicable to the data type or the data type is not numeric.), +// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL +// is returned if a localized name is not supported by the data source), +// minimum_scale: int32 (The minimum scale of the data type on the data source. +// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE +// columns both contain this value. NULL is returned if scale is not applicable.), +// maximum_scale: int32 (The maximum scale of the data type on the data source. +// NULL is returned if scale is not applicable.), +// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values +// as data_type value. Except for interval and datetime, which +// uses generic values. More info about those types can be +// obtained through datetime_subcode. The possible values can be seen +// in the XdbcDataType enum.), +// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime.
It contains +// its sub types. For type different from interval and datetime, this value +// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), +// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains +// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For +// exact numeric types, this column contains the value 10 to indicate that +// column size specifies a number of decimal digits. Otherwise, this column is NULL.), +// interval_precision: int32 (If the data type is an interval data type, then this column contains the value +// of the interval leading precision. Otherwise, this column is NULL. This fields +// is only relevant to be used by ODBC). // > // The returned data should be ordered by data_type and then by type_name. type CommandGetXdbcTypeInfo struct { @@ -2379,6 +2438,7 @@ type CommandGetXdbcTypeInfo struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the data type to search for the info. DataType *int32 `protobuf:"varint,1,opt,name=data_type,json=dataType,proto3,oneof" json:"data_type,omitempty"` } @@ -2422,17 +2482,16 @@ func (x *CommandGetXdbcTypeInfo) GetDataType() int32 { return 0 } +// // Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. // The definition of a catalog depends on vendor/implementation. It is usually the database itself // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8 not null -// +// catalog_name: utf8 not null // > // The returned data should be ordered by catalog_name. type CommandGetCatalogs struct { @@ -2473,18 +2532,17 @@ func (*CommandGetCatalogs) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{2} } +// // Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. // The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8 not null -// +// catalog_name: utf8, +// db_schema_name: utf8 not null // > // The returned data should be ordered by catalog_name, then db_schema_name. type CommandGetDbSchemas struct { @@ -2492,15 +2550,17 @@ type CommandGetDbSchemas struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the Catalog to search for the tables. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies a filter pattern for schemas to search for. // When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. 
// In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. DbSchemaFilterPattern *string `protobuf:"bytes,2,opt,name=db_schema_filter_pattern,json=dbSchemaFilterPattern,proto3,oneof" json:"db_schema_filter_pattern,omitempty"` } @@ -2550,56 +2610,58 @@ func (x *CommandGetDbSchemas) GetDbSchemaFilterPattern() string { return "" } +// // Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8, -// table_name: utf8 not null, -// table_type: utf8 not null, -// [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, -// it is serialized as an IPC message.) -// +// catalog_name: utf8, +// db_schema_name: utf8, +// table_name: utf8 not null, +// table_type: utf8 not null, +// [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, +// it is serialized as an IPC message.) // > // Fields on table_schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. // The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. 
type CommandGetTables struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the Catalog to search for the tables. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies a filter pattern for schemas to search for. // When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. DbSchemaFilterPattern *string `protobuf:"bytes,2,opt,name=db_schema_filter_pattern,json=dbSchemaFilterPattern,proto3,oneof" json:"db_schema_filter_pattern,omitempty"` + // // Specifies a filter pattern for tables to search for. // When no table_name_filter_pattern is provided, all tables matching other filters are searched. // In the pattern string, two special characters can be used to denote matching rules: - // - "%" means to match any substring with 0 or more characters. - // - "_" means to match any one character. + // - "%" means to match any substring with 0 or more characters. + // - "_" means to match any one character. TableNameFilterPattern *string `protobuf:"bytes,3,opt,name=table_name_filter_pattern,json=tableNameFilterPattern,proto3,oneof" json:"table_name_filter_pattern,omitempty"` + // // Specifies a filter of table types which must match. // The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. // TABLE, VIEW, and SYSTEM TABLE are commonly supported. @@ -2675,18 +2737,17 @@ func (x *CommandGetTables) GetIncludeSchema() bool { return false } +// // Represents a request to retrieve the list of table types on a Flight SQL enabled backend. // The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. // TABLE, VIEW, and SYSTEM TABLE are commonly supported. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// table_type: utf8 not null -// +// table_type: utf8 not null // > // The returned data should be ordered by table_type. type CommandGetTableTypes struct { @@ -2727,21 +2788,20 @@ func (*CommandGetTableTypes) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{5} } +// // Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. 
// // The returned Arrow schema will be: // < -// -// catalog_name: utf8, -// db_schema_name: utf8, -// table_name: utf8 not null, -// column_name: utf8 not null, -// key_name: utf8, -// key_sequence: int32 not null -// +// catalog_name: utf8, +// db_schema_name: utf8, +// table_name: utf8 not null, +// column_name: utf8 not null, +// key_name: utf8, +// key_sequence: int32 not null // > // The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. type CommandGetPrimaryKeys struct { @@ -2749,10 +2809,12 @@ type CommandGetPrimaryKeys struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. @@ -2814,29 +2876,28 @@ func (x *CommandGetPrimaryKeys) GetTable() string { return "" } +// // Represents a request to retrieve a description of the foreign key columns that reference the given table's // primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. @@ -2845,10 +2906,12 @@ type CommandGetExportedKeys struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the foreign key table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the foreign key table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. 
@@ -2910,45 +2973,46 @@ func (x *CommandGetExportedKeys) GetTable() string { return "" } +// // Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. // // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions: -// - 0 = CASCADE -// - 1 = RESTRICT -// - 2 = SET NULL -// - 3 = NO ACTION -// - 4 = SET DEFAULT +// - 0 = CASCADE +// - 1 = RESTRICT +// - 2 = SET NULL +// - 3 = NO ACTION +// - 4 = SET DEFAULT type CommandGetImportedKeys struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields + // // Specifies the catalog to search for the primary key table. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. Catalog *string `protobuf:"bytes,1,opt,name=catalog,proto3,oneof" json:"catalog,omitempty"` + // // Specifies the schema to search for the primary key table. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. @@ -3010,67 +3074,66 @@ func (x *CommandGetImportedKeys) GetTable() string { return "" } +// // Represents a request to retrieve a description of the foreign key columns in the given foreign key table that // reference the primary key or the columns representing a unique constraint of the parent table (could be the same // or a different table) on a Flight SQL enabled backend. // Used in the command member of FlightDescriptor for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// - GetFlightInfo: execute the catalog metadata request. +// - GetSchema: return the Arrow schema of the query. +// - GetFlightInfo: execute the catalog metadata request. 
// // The returned Arrow schema will be: // < -// -// pk_catalog_name: utf8, -// pk_db_schema_name: utf8, -// pk_table_name: utf8 not null, -// pk_column_name: utf8 not null, -// fk_catalog_name: utf8, -// fk_db_schema_name: utf8, -// fk_table_name: utf8 not null, -// fk_column_name: utf8 not null, -// key_sequence: int32 not null, -// fk_key_name: utf8, -// pk_key_name: utf8, -// update_rule: uint8 not null, -// delete_rule: uint8 not null -// +// pk_catalog_name: utf8, +// pk_db_schema_name: utf8, +// pk_table_name: utf8 not null, +// pk_column_name: utf8 not null, +// fk_catalog_name: utf8, +// fk_db_schema_name: utf8, +// fk_table_name: utf8 not null, +// fk_column_name: utf8 not null, +// key_sequence: int32 not null, +// fk_key_name: utf8, +// pk_key_name: utf8, +// update_rule: uint8 not null, +// delete_rule: uint8 not null // > // The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. // update_rule and delete_rule returns a byte that is equivalent to actions: -// - 0 = CASCADE -// - 1 = RESTRICT -// - 2 = SET NULL -// - 3 = NO ACTION -// - 4 = SET DEFAULT +// - 0 = CASCADE +// - 1 = RESTRICT +// - 2 = SET NULL +// - 3 = NO ACTION +// - 4 = SET DEFAULT type CommandGetCrossReference struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - // * + //* // The catalog name where the parent table is. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. PkCatalog *string `protobuf:"bytes,1,opt,name=pk_catalog,json=pkCatalog,proto3,oneof" json:"pk_catalog,omitempty"` - // * + //* // The Schema name where the parent table is. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. PkDbSchema *string `protobuf:"bytes,2,opt,name=pk_db_schema,json=pkDbSchema,proto3,oneof" json:"pk_db_schema,omitempty"` - // * + //* // The parent table name. It cannot be null. PkTable string `protobuf:"bytes,3,opt,name=pk_table,json=pkTable,proto3" json:"pk_table,omitempty"` - // * + //* // The catalog name where the foreign table is. // An empty string retrieves those without a catalog. // If omitted the catalog name should not be used to narrow the search. FkCatalog *string `protobuf:"bytes,4,opt,name=fk_catalog,json=fkCatalog,proto3,oneof" json:"fk_catalog,omitempty"` - // * + //* // The schema name where the foreign table is. // An empty string retrieves those without a schema. // If omitted the schema name should not be used to narrow the search. FkDbSchema *string `protobuf:"bytes,5,opt,name=fk_db_schema,json=fkDbSchema,proto3,oneof" json:"fk_db_schema,omitempty"` - // * + //* // The foreign table name. It cannot be null. FkTable string `protobuf:"bytes,6,opt,name=fk_table,json=fkTable,proto3" json:"fk_table,omitempty"` } @@ -3149,6 +3212,7 @@ func (x *CommandGetCrossReference) GetFkTable() string { return "" } +// // Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. type ActionCreatePreparedStatementRequest struct { state protoimpl.MessageState @@ -3208,6 +3272,7 @@ func (x *ActionCreatePreparedStatementRequest) GetTransactionId() []byte { return nil } +// // An embedded message describing a Substrait plan to execute. 
type SubstraitPlan struct { state protoimpl.MessageState @@ -3271,6 +3336,7 @@ func (x *SubstraitPlan) GetVersion() string { return "" } +// // Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. type ActionCreatePreparedSubstraitPlanRequest struct { state protoimpl.MessageState @@ -3330,6 +3396,7 @@ func (x *ActionCreatePreparedSubstraitPlanRequest) GetTransactionId() []byte { return nil } +// // Wrap the result of a "CreatePreparedStatement" or "CreatePreparedSubstraitPlan" action. // // The resultant PreparedStatement can be closed either: @@ -3405,6 +3472,7 @@ func (x *ActionCreatePreparedStatementResult) GetParameterSchema() []byte { return nil } +// // Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. // Closes server resources associated with the prepared statement handle. type ActionClosePreparedStatementRequest struct { @@ -3455,6 +3523,7 @@ func (x *ActionClosePreparedStatementRequest) GetPreparedStatementHandle() []byt return nil } +// // Request message for the "BeginTransaction" action. // Begins a transaction. type ActionBeginTransactionRequest struct { @@ -3495,6 +3564,7 @@ func (*ActionBeginTransactionRequest) Descriptor() ([]byte, []int) { return file_FlightSql_proto_rawDescGZIP(), []int{15} } +// // Request message for the "BeginSavepoint" action. // Creates a savepoint within a transaction. // @@ -3557,6 +3627,7 @@ func (x *ActionBeginSavepointRequest) GetName() string { return "" } +// // The result of a "BeginTransaction" action. // // The transaction can be manipulated with the "EndTransaction" action, or @@ -3612,6 +3683,7 @@ func (x *ActionBeginTransactionResult) GetTransactionId() []byte { return nil } +// // The result of a "BeginSavepoint" action. // // The transaction can be manipulated with the "EndSavepoint" action. @@ -3667,6 +3739,7 @@ func (x *ActionBeginSavepointResult) GetSavepointId() []byte { return nil } +// // Request message for the "EndTransaction" action. // // Commit (COMMIT) or rollback (ROLLBACK) the transaction. @@ -3730,6 +3803,7 @@ func (x *ActionEndTransactionRequest) GetAction() ActionEndTransactionRequest_En return ActionEndTransactionRequest_END_TRANSACTION_UNSPECIFIED } +// // Request message for the "EndSavepoint" action. // // Release (RELEASE) the savepoint or rollback (ROLLBACK) to the @@ -3795,21 +3869,22 @@ func (x *ActionEndSavepointRequest) GetAction() ActionEndSavepointRequest_EndSav return ActionEndSavepointRequest_END_SAVEPOINT_UNSPECIFIED } +// // Represents a SQL query. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. 
-// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - GetFlightInfo: execute the query. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - GetFlightInfo: execute the query. type CommandStatementQuery struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -3867,22 +3942,23 @@ func (x *CommandStatementQuery) GetTransactionId() []byte { return nil } +// // Represents a Substrait plan. Used in the command member of FlightDescriptor // for the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - GetFlightInfo: execute the query. -// - DoPut: execute the query. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - GetFlightInfo: execute the query. +// - DoPut: execute the query. 
type CommandStatementSubstraitPlan struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -3940,7 +4016,7 @@ func (x *CommandStatementSubstraitPlan) GetTransactionId() []byte { return nil } -// * +//* // Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. // This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. type TicketStatementQuery struct { @@ -3991,22 +4067,23 @@ func (x *TicketStatementQuery) GetStatementHandle() []byte { return nil } +// // Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for // the following RPC calls: -// - GetSchema: return the Arrow schema of the query. -// Fields on this schema may contain the following metadata: -// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name -// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name -// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name -// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. -// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size -// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable -// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. -// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. -// - GetFlightInfo: execute the prepared statement instance. +// - GetSchema: return the Arrow schema of the query. +// Fields on this schema may contain the following metadata: +// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. +// - GetFlightInfo: execute the prepared statement instance. type CommandPreparedStatementQuery struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -4055,6 +4132,7 @@ func (x *CommandPreparedStatementQuery) GetPreparedStatementHandle() []byte { return nil } +// // Represents a SQL update query. Used in the command member of FlightDescriptor // for the the RPC call DoPut to cause the server to execute the included SQL update. type CommandStatementUpdate struct { @@ -4114,6 +4192,7 @@ func (x *CommandStatementUpdate) GetTransactionId() []byte { return nil } +// // Represents a SQL update query. 
Used in the command member of FlightDescriptor // for the the RPC call DoPut to cause the server to execute the included // prepared statement handle as an update. @@ -4165,6 +4244,7 @@ func (x *CommandPreparedStatementUpdate) GetPreparedStatementHandle() []byte { return nil } +// // Returned from the RPC call DoPut when a CommandStatementUpdate // CommandPreparedStatementUpdate was in the request, containing // results from the update. @@ -4217,6 +4297,7 @@ func (x *DoPutUpdateResult) GetRecordCount() int64 { return 0 } +// // Request message for the "CancelQuery" action. // // Explicitly cancel a running query. @@ -4285,10 +4366,14 @@ func (x *ActionCancelQueryRequest) GetInfo() []byte { return nil } +// // The result of cancelling a query. // // The result should be wrapped in a google.protobuf.Any message. // +// This command is deprecated since 13.0.0. Use the "CancelFlightInfo" +// action with DoAction instead. +// // Deprecated: Do not use. type ActionCancelQueryResult struct { state protoimpl.MessageState @@ -4339,7 +4424,7 @@ func (x *ActionCancelQueryResult) GetResult() ActionCancelQueryResult_CancelResu var file_FlightSql_proto_extTypes = []protoimpl.ExtensionInfo{ { - ExtendedType: (*descriptorpb.MessageOptions)(nil), + ExtendedType: (*descriptor.MessageOptions)(nil), ExtensionType: (*bool)(nil), Field: 1000, Name: "arrow.flight.protocol.sql.experimental", @@ -4348,7 +4433,7 @@ var file_FlightSql_proto_extTypes = []protoimpl.ExtensionInfo{ }, } -// Extension fields to descriptorpb.MessageOptions. +// Extension fields to descriptor.MessageOptions. var ( // optional bool experimental = 1000; E_Experimental = &file_FlightSql_proto_extTypes[0] @@ -5163,7 +5248,7 @@ var file_FlightSql_proto_goTypes = []interface{}{ (*DoPutUpdateResult)(nil), // 52: arrow.flight.protocol.sql.DoPutUpdateResult (*ActionCancelQueryRequest)(nil), // 53: arrow.flight.protocol.sql.ActionCancelQueryRequest (*ActionCancelQueryResult)(nil), // 54: arrow.flight.protocol.sql.ActionCancelQueryResult - (*descriptorpb.MessageOptions)(nil), // 55: google.protobuf.MessageOptions + (*descriptor.MessageOptions)(nil), // 55: google.protobuf.MessageOptions } var file_FlightSql_proto_depIdxs = []int32{ 36, // 0: arrow.flight.protocol.sql.ActionCreatePreparedSubstraitPlanRequest.plan:type_name -> arrow.flight.protocol.sql.SubstraitPlan diff --git a/go/arrow/flight/internal/flight/Flight_grpc.pb.go b/go/arrow/flight/gen/flight/Flight_grpc.pb.go similarity index 93% rename from go/arrow/flight/internal/flight/Flight_grpc.pb.go rename to go/arrow/flight/gen/flight/Flight_grpc.pb.go index 10fd285a5c10b..9613114448796 100644 --- a/go/arrow/flight/internal/flight/Flight_grpc.pb.go +++ b/go/arrow/flight/gen/flight/Flight_grpc.pb.go @@ -1,4 +1,8 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.12.4 +// source: Flight.proto package flight @@ -11,17 +15,20 @@ import ( // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // FlightServiceClient is the client API for FlightService service. // // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. type FlightServiceClient interface { + // // Handshake between client and server. 
Depending on the server, the // handshake may be required to determine the token that should be used for // future operations. Both request and response are streams to allow multiple // round-trips depending on auth mechanism. Handshake(ctx context.Context, opts ...grpc.CallOption) (FlightService_HandshakeClient, error) + // // Get a list of available streams given a particular criteria. Most flight // services will expose one or more streams that are readily available for // retrieval. This api allows listing the streams available for @@ -29,6 +36,7 @@ type FlightServiceClient interface { // the subset of streams that can be listed via this interface. Each flight // service allows its own definition of how to consume criteria. ListFlights(ctx context.Context, in *Criteria, opts ...grpc.CallOption) (FlightService_ListFlightsClient, error) + // // For a given FlightDescriptor, get information about how the flight can be // consumed. This is a useful interface if the consumer of the interface // already can identify the specific flight to consume. This interface can @@ -40,16 +48,19 @@ type FlightServiceClient interface { // available for consumption for the duration defined by the specific flight // service. GetFlightInfo(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*FlightInfo, error) + // // For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema // This is used when a consumer needs the Schema of flight stream. Similar to // GetFlightInfo this interface may generate a new flight that was not previously // available in ListFlights. GetSchema(ctx context.Context, in *FlightDescriptor, opts ...grpc.CallOption) (*SchemaResult, error) + // // Retrieve a single stream associated with a particular descriptor // associated with the referenced ticket. A Flight can be composed of one or // more streams where each stream can be retrieved using a separate opaque // ticket that the flight service uses for managing a collection of streams. DoGet(ctx context.Context, in *Ticket, opts ...grpc.CallOption) (FlightService_DoGetClient, error) + // // Push a stream to the flight service associated with a particular // flight stream. This allows a client of a flight service to upload a stream // of data. Depending on the particular flight service, a client consumer @@ -57,12 +68,14 @@ type FlightServiceClient interface { // number. In the latter, the service might implement a 'seal' action that // can be applied to a descriptor once all streams are uploaded. DoPut(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoPutClient, error) + // // Open a bidirectional data channel for a given descriptor. This // allows clients to send and receive arbitrary Arrow data and // application-specific metadata in a single logical stream. In // contrast to DoGet/DoPut, this is more suited for clients // offloading computation (rather than storage) to a Flight service. DoExchange(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoExchangeClient, error) + // // Flight services can support an arbitrary number of simple actions in // addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut // operations that are potentially available. DoAction allows a flight client @@ -70,6 +83,7 @@ type FlightServiceClient interface { // opaque request and response objects that are specific to the type action // being undertaken. 
DoAction(ctx context.Context, in *Action, opts ...grpc.CallOption) (FlightService_DoActionClient, error) + // // A flight service exposes all of the available action types that it has // along with descriptions. This allows different flight consumers to // understand the capabilities of the flight service. @@ -85,7 +99,7 @@ func NewFlightServiceClient(cc grpc.ClientConnInterface) FlightServiceClient { } func (c *flightServiceClient) Handshake(ctx context.Context, opts ...grpc.CallOption) (FlightService_HandshakeClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[0], "/arrow.flight.protocol.FlightService/Handshake", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[0], "/arrow.flight.protocol.FlightService/Handshake", opts...) if err != nil { return nil, err } @@ -116,7 +130,7 @@ func (x *flightServiceHandshakeClient) Recv() (*HandshakeResponse, error) { } func (c *flightServiceClient) ListFlights(ctx context.Context, in *Criteria, opts ...grpc.CallOption) (FlightService_ListFlightsClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[1], "/arrow.flight.protocol.FlightService/ListFlights", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[1], "/arrow.flight.protocol.FlightService/ListFlights", opts...) if err != nil { return nil, err } @@ -166,7 +180,7 @@ func (c *flightServiceClient) GetSchema(ctx context.Context, in *FlightDescripto } func (c *flightServiceClient) DoGet(ctx context.Context, in *Ticket, opts ...grpc.CallOption) (FlightService_DoGetClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[2], "/arrow.flight.protocol.FlightService/DoGet", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[2], "/arrow.flight.protocol.FlightService/DoGet", opts...) if err != nil { return nil, err } @@ -198,7 +212,7 @@ func (x *flightServiceDoGetClient) Recv() (*FlightData, error) { } func (c *flightServiceClient) DoPut(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoPutClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[3], "/arrow.flight.protocol.FlightService/DoPut", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[3], "/arrow.flight.protocol.FlightService/DoPut", opts...) if err != nil { return nil, err } @@ -229,7 +243,7 @@ func (x *flightServiceDoPutClient) Recv() (*PutResult, error) { } func (c *flightServiceClient) DoExchange(ctx context.Context, opts ...grpc.CallOption) (FlightService_DoExchangeClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[4], "/arrow.flight.protocol.FlightService/DoExchange", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[4], "/arrow.flight.protocol.FlightService/DoExchange", opts...) if err != nil { return nil, err } @@ -260,7 +274,7 @@ func (x *flightServiceDoExchangeClient) Recv() (*FlightData, error) { } func (c *flightServiceClient) DoAction(ctx context.Context, in *Action, opts ...grpc.CallOption) (FlightService_DoActionClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[5], "/arrow.flight.protocol.FlightService/DoAction", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[5], "/arrow.flight.protocol.FlightService/DoAction", opts...) 
if err != nil { return nil, err } @@ -292,7 +306,7 @@ func (x *flightServiceDoActionClient) Recv() (*Result, error) { } func (c *flightServiceClient) ListActions(ctx context.Context, in *Empty, opts ...grpc.CallOption) (FlightService_ListActionsClient, error) { - stream, err := c.cc.NewStream(ctx, &_FlightService_serviceDesc.Streams[6], "/arrow.flight.protocol.FlightService/ListActions", opts...) + stream, err := c.cc.NewStream(ctx, &FlightService_ServiceDesc.Streams[6], "/arrow.flight.protocol.FlightService/ListActions", opts...) if err != nil { return nil, err } @@ -327,11 +341,13 @@ func (x *flightServiceListActionsClient) Recv() (*ActionType, error) { // All implementations must embed UnimplementedFlightServiceServer // for forward compatibility type FlightServiceServer interface { + // // Handshake between client and server. Depending on the server, the // handshake may be required to determine the token that should be used for // future operations. Both request and response are streams to allow multiple // round-trips depending on auth mechanism. Handshake(FlightService_HandshakeServer) error + // // Get a list of available streams given a particular criteria. Most flight // services will expose one or more streams that are readily available for // retrieval. This api allows listing the streams available for @@ -339,6 +355,7 @@ type FlightServiceServer interface { // the subset of streams that can be listed via this interface. Each flight // service allows its own definition of how to consume criteria. ListFlights(*Criteria, FlightService_ListFlightsServer) error + // // For a given FlightDescriptor, get information about how the flight can be // consumed. This is a useful interface if the consumer of the interface // already can identify the specific flight to consume. This interface can @@ -350,16 +367,19 @@ type FlightServiceServer interface { // available for consumption for the duration defined by the specific flight // service. GetFlightInfo(context.Context, *FlightDescriptor) (*FlightInfo, error) + // // For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema // This is used when a consumer needs the Schema of flight stream. Similar to // GetFlightInfo this interface may generate a new flight that was not previously // available in ListFlights. GetSchema(context.Context, *FlightDescriptor) (*SchemaResult, error) + // // Retrieve a single stream associated with a particular descriptor // associated with the referenced ticket. A Flight can be composed of one or // more streams where each stream can be retrieved using a separate opaque // ticket that the flight service uses for managing a collection of streams. DoGet(*Ticket, FlightService_DoGetServer) error + // // Push a stream to the flight service associated with a particular // flight stream. This allows a client of a flight service to upload a stream // of data. Depending on the particular flight service, a client consumer @@ -367,12 +387,14 @@ type FlightServiceServer interface { // number. In the latter, the service might implement a 'seal' action that // can be applied to a descriptor once all streams are uploaded. DoPut(FlightService_DoPutServer) error + // // Open a bidirectional data channel for a given descriptor. This // allows clients to send and receive arbitrary Arrow data and // application-specific metadata in a single logical stream. In // contrast to DoGet/DoPut, this is more suited for clients // offloading computation (rather than storage) to a Flight service. 
DoExchange(FlightService_DoExchangeServer) error + // // Flight services can support an arbitrary number of simple actions in // addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut // operations that are potentially available. DoAction allows a flight client @@ -380,6 +402,7 @@ type FlightServiceServer interface { // opaque request and response objects that are specific to the type action // being undertaken. DoAction(*Action, FlightService_DoActionServer) error + // // A flight service exposes all of the available action types that it has // along with descriptions. This allows different flight consumers to // understand the capabilities of the flight service. @@ -427,8 +450,8 @@ type UnsafeFlightServiceServer interface { mustEmbedUnimplementedFlightServiceServer() } -func RegisterFlightServiceServer(s *grpc.Server, srv FlightServiceServer) { - s.RegisterService(&_FlightService_serviceDesc, srv) +func RegisterFlightServiceServer(s grpc.ServiceRegistrar, srv FlightServiceServer) { + s.RegisterService(&FlightService_ServiceDesc, srv) } func _FlightService_Handshake_Handler(srv interface{}, stream grpc.ServerStream) error { @@ -629,7 +652,10 @@ func (x *flightServiceListActionsServer) Send(m *ActionType) error { return x.ServerStream.SendMsg(m) } -var _FlightService_serviceDesc = grpc.ServiceDesc{ +// FlightService_ServiceDesc is the grpc.ServiceDesc for FlightService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var FlightService_ServiceDesc = grpc.ServiceDesc{ ServiceName: "arrow.flight.protocol.FlightService", HandlerType: (*FlightServiceServer)(nil), Methods: []grpc.MethodDesc{ diff --git a/go/arrow/flight/server.go b/go/arrow/flight/server.go index 1dd02d0defaed..c5e64986d5f78 100644 --- a/go/arrow/flight/server.go +++ b/go/arrow/flight/server.go @@ -22,7 +22,7 @@ import ( "os" "os/signal" - "github.com/apache/arrow/go/v13/arrow/flight/internal/flight" + "github.com/apache/arrow/go/v13/arrow/flight/gen/flight" "google.golang.org/grpc" ) From eed5fe41a734da2d9806e369cbadc7894ba7f321 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Wed, 26 Jul 2023 21:10:32 -0400 Subject: [PATCH 056/749] GH-36885: [Java][Docs] Add substrait dependency to maven build profiles (#36899) ### Rationale for this change The Java JNI dataset module recently included the Substrait module as a dependency. The dependency was added to the CI scripts, but not added to the build profiles and documentation yet. ### What changes are included in this PR? - Update maven build profiles - Update Java build documentation ### Are these changes tested? I tested locally on MacOS and was able to reproduce + fix with this change. ### Are there any user-facing changes? 
No * Closes: #36885 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- docs/source/developers/java/building.rst | 2 ++ java/pom.xml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 5b525d467731e..9d773f051451e 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -219,6 +219,7 @@ CMake -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_S3=ON \ + -DARROW_SUBSTRAIT=ON \ -DARROW_USE_CCACHE=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_LIBDIR=lib/ \ @@ -258,6 +259,7 @@ CMake -DARROW_ORC=OFF ^ -DARROW_PARQUET=ON ^ -DARROW_S3=ON ^ + -DARROW_SUBSTRAIT=ON ^ -DARROW_USE_CCACHE=ON ^ -DARROW_WITH_BROTLI=ON ^ -DARROW_WITH_LZ4=ON ^ diff --git a/java/pom.xml b/java/pom.xml index ccb2a2b72d5e6..e997406f0238d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1022,6 +1022,7 @@ -DARROW_ORC=${ARROW_ORC} -DARROW_PARQUET=${ARROW_PARQUET} -DARROW_S3=ON + -DARROW_SUBSTRAIT=${ARROW_DATASET} -DARROW_USE_CCACHE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_LIBDIR=lib/${os.detected.arch} @@ -1131,6 +1132,7 @@ -DARROW_ORC=${ARROW_ORC} -DARROW_PARQUET=${ARROW_PARQUET} -DARROW_S3=ON + -DARROW_SUBSTRAIT=${ARROW_DATASET} -DARROW_USE_CCACHE=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON From 5e1c112e93092bcbf62c45b51fd6629463332c4d Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 26 Jul 2023 18:13:43 -0800 Subject: [PATCH 057/749] MINOR: [FlightRPC][C++][Python][Docs] Minor tweaks to docstrings (#36900) ### Rationale for this change Improves language surrounding Flight Shutdown/Wait/Serve. ### What changes are included in this PR? Just docstring tweaks. ### Are these changes tested? No tests, just docstrings. ### Are there any user-facing changes? n/a Authored-by: Bryce Mecum Signed-off-by: Sutou Kouhei --- cpp/src/arrow/flight/server.h | 20 +++++++++++++------- python/pyarrow/_flight.pyx | 4 +++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 5c15d8d5645d1..76f1a317068e6 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -202,8 +202,10 @@ class ARROW_FLIGHT_EXPORT FlightServerBase { Status SetShutdownOnSignals(const std::vector sigs); /// \brief Start serving. - /// This method blocks until either Shutdown() is called or one of the signals - /// registered in SetShutdownOnSignals() is received. + /// This method blocks until the server shuts down. + /// + /// The server will start to shut down when either Shutdown() is called + /// or one of the signals registered in SetShutdownOnSignals() is received. Status Serve(); /// \brief Query whether Serve() was interrupted by a signal. @@ -212,14 +214,18 @@ class ARROW_FLIGHT_EXPORT FlightServerBase { /// \return int the signal number that interrupted Serve(), if any, otherwise 0 int GotSignal() const; - /// \brief Shut down the server. Can be called from signal handler or another - /// thread while Serve() blocks. Optionally a deadline can be set. Once the - /// the deadline expires server will wait until remaining running calls - /// complete. + /// \brief Shut down the server, blocking until current requests finish. + /// + /// Can be called from a signal handler or another thread while Serve() + /// blocks. Optionally a deadline can be set. Once the the deadline expires + /// server will wait until remaining running calls complete. /// + /// Should only be called once. 
  Status Shutdown(const std::chrono::system_clock::time_point* deadline = NULLPTR);
 
-  /// \brief Block until server is terminated with Shutdown.
+  /// \brief Block until server shuts down with Shutdown.
+  ///
+  /// Does not respond to signals like Serve().
   Status Wait();
 
   // Implement these methods to create your own server. The default
diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx
index 6f5cd03cd56bf..c9f5526754e3d 100644
--- a/python/pyarrow/_flight.pyx
+++ b/python/pyarrow/_flight.pyx
@@ -3016,7 +3016,7 @@ cdef class FlightServerBase(_Weakrefable):
 
     def serve(self):
         """Block until the server shuts down.
 
-        This method only returns if shutdown() is called or a signal a
+        This method only returns if shutdown() is called or a signal is
         received.
         """
         if self.server.get() == nullptr:
@@ -3041,6 +3041,8 @@ cdef class FlightServerBase(_Weakrefable):
         method, as then the server will block forever waiting for that
         request to finish. Instead, call this method from a background
         thread.
+
+        This method should only be called once.
         """
         # Must not hold the GIL: shutdown waits for pending RPCs to
         # complete. Holding the GIL means Python-implemented Flight

From 0f291473650ac0fa5f95bccb8af0d0b8029c4be2 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Thu, 27 Jul 2023 22:13:29 +0900
Subject: [PATCH 058/749] GH-36860: [C++] Report CMake error when system
 Protobuf exists but system gRPC doesn't exist (#36904)

### Rationale for this change

We require system gRPC when system Protobuf is found, to avoid a mismatch in the underlying library (Abseil). We should report this as an error at `cmake` configuration time instead of at build time.

### What changes are included in this PR?

Always call `find_package_handle_standard_args()` to report an error for this case.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
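For illustration, here is a minimal sketch of the CMake pattern this change relies on (not the full find module; the variable names mirror the patch below). `find_package_handle_standard_args()` fails when a required variable is unset, so calling it unconditionally turns the missing-system-gRPC case into a configure-time error:

```cmake
include(FindPackageHandleStandardArgs)

find_package(PkgConfig)
pkg_check_modules(GRPCPP_PC grpc++)
if(GRPCPP_PC_FOUND)
  # Only set when pkg-config located a system gRPC.
  find_library(GRPCPP_IMPORTED_LOCATION grpc++
               HINTS ${GRPCPP_PC_PREFIX}
               NO_DEFAULT_PATH)
endif()

# Now called whether or not pkg-config succeeded: if
# GRPCPP_IMPORTED_LOCATION is unset, `find_package(gRPCAlt REQUIRED)`
# fails while running `cmake`, rather than later during the build.
find_package_handle_standard_args(
  gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION)
```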
* Closes: #36860 Authored-by: Sutou Kouhei Signed-off-by: David Li --- cpp/cmake_modules/FindgRPCAlt.cmake | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/cmake_modules/FindgRPCAlt.cmake b/cpp/cmake_modules/FindgRPCAlt.cmake index 81d8cf7ca68ee..2ff10dbc23dd2 100644 --- a/cpp/cmake_modules/FindgRPCAlt.cmake +++ b/cpp/cmake_modules/FindgRPCAlt.cmake @@ -57,13 +57,15 @@ if(GRPCPP_PC_FOUND) HINTS ${GRPCPP_PC_PREFIX} NO_DEFAULT_PATH PATH_SUFFIXES "bin") - set(gRPCAlt_FIND_PACKAGE_ARGS gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION - GRPC_CPP_PLUGIN) - if(gRPCAlt_VERSION) - list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) - endif() - find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) +endif() +set(gRPCAlt_FIND_PACKAGE_ARGS gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION + GRPC_CPP_PLUGIN) +if(gRPCAlt_VERSION) + list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) +endif() +find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) +if(gRPCAlt_FOUND) # gRPC does not expose the reflection library via pkg-config, but it should be alongside the main library get_filename_component(GRPCPP_IMPORTED_DIRECTORY ${GRPCPP_IMPORTED_LOCATION} DIRECTORY) if(ARROW_GRPC_USE_SHARED) @@ -77,11 +79,7 @@ if(GRPCPP_PC_FOUND) NAMES grpc++_reflection ${GRPCPP_REFLECTION_LIB_NAME} PATHS ${GRPCPP_IMPORTED_DIRECTORY} NO_DEFAULT_PATH) -else() - set(gRPCAlt_FOUND FALSE) -endif() -if(gRPCAlt_FOUND) add_library(gRPC::grpc++ UNKNOWN IMPORTED) set_target_properties(gRPC::grpc++ PROPERTIES IMPORTED_LOCATION "${GRPCPP_IMPORTED_LOCATION}" From bedc1e7cbb5d07469dd976350763ab4e93ed64ad Mon Sep 17 00:00:00 2001 From: Elliott Brossard <64754120+sfc-gh-ebrossard@users.noreply.github.com> Date: Thu, 27 Jul 2023 10:48:26 -0700 Subject: [PATCH 059/749] GH-36913: [C++] Skip empty buffer concatenation to fix UBSan error (#36914) ### Rationale for this change This is a trivial fix for a UBSan error in calls to `ConcatenateBuffers` with an empty buffer that has a null data pointer. ### What changes are included in this PR? Conditional call to `std::memcpy` based on whether the buffer's length is 0. ### Are these changes tested? Test added in buffer_test.cc. ### Are there any user-facing changes? 
No

* Closes: #36913

Lead-authored-by: Elliott Brossard
Co-authored-by: Elliott Brossard <64754120+sfc-gh-ebrossard@users.noreply.github.com>
Co-authored-by: Antoine Pitrou
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/buffer.cc      |  7 +++++--
 cpp/src/arrow/buffer_test.cc |  9 +++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc
index 135aa0c5328ad..99dc29cfe5296 100644
--- a/cpp/src/arrow/buffer.cc
+++ b/cpp/src/arrow/buffer.cc
@@ -213,8 +213,11 @@ Result<std::shared_ptr<Buffer>> ConcatenateBuffers(
   ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length, pool));
   auto out_data = out->mutable_data();
   for (const auto& buffer : buffers) {
-    std::memcpy(out_data, buffer->data(), buffer->size());
-    out_data += buffer->size();
+    // Passing nullptr to std::memcpy is undefined behavior, so skip empty buffers
+    if (buffer->size() != 0) {
+      std::memcpy(out_data, buffer->data(), buffer->size());
+      out_data += buffer->size();
+    }
   }
   return std::move(out);
 }
diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc
index 3dd95cb8af5c6..13f6ea63b5e62 100644
--- a/cpp/src/arrow/buffer_test.cc
+++ b/cpp/src/arrow/buffer_test.cc
@@ -1014,4 +1014,13 @@ TYPED_TEST(TypedTestBuffer, ResizeOOM) {
 #endif
 }
 
+TEST(TestBufferConcatenation, EmptyBuffer) {
+  // GH-36913: UB shouldn't be triggered by copying from a null pointer
+  const std::string contents = "hello, world";
+  auto buffer = std::make_shared<Buffer>(contents);
+  auto empty_buffer = std::make_shared<Buffer>(/*data=*/nullptr, /*size=*/0);
+  ASSERT_OK_AND_ASSIGN(auto result, ConcatenateBuffers({buffer, empty_buffer}));
+  AssertMyBufferEqual(*result, contents);
+}
+
 }  // namespace arrow

From 5b744ab0e854f494f8df145b27dc30236df13221 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 28 Jul 2023 09:50:29 +0900
Subject: [PATCH 060/749] GH-36922: [CI][C++][Windows] Search OpenSSL from
 PATH (#36923)

### Rationale for this change

It seems that the OpenSSL install script adds the OpenSSL install folder to `PATH`:
https://github.com/actions/runner-images/blob/665e71067ff126acea71e7d93715c83db038597f/images/win/scripts/Installers/Install-OpenSSL.ps1#L37

If the OpenSSL install folder is in `PATH`, we don't need to specify `OPENSSL_ROOT_DIR` explicitly, because `find_*` commands such as [`find_library()`](https://cmake.org/cmake/help/latest/command/find_library.html) search paths derived from `PATH` by default:

> On Windows hosts: `<prefix>/lib/<arch>` if [`CMAKE_LIBRARY_ARCHITECTURE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LIBRARY_ARCHITECTURE.html#variable:CMAKE_LIBRARY_ARCHITECTURE) is set, and `<prefix>/lib` for each `<prefix>/[s]bin` in `PATH`, and `<entry>/lib` for other entries in `PATH`.

### What changes are included in this PR?

Remove `OPENSSL_ROOT_DIR`.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
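As a small, hypothetical illustration of the search behavior quoted above (the paths are examples, not the exact runner layout): once the installer has put `C:\Program Files\OpenSSL-Win64\bin` on `PATH`, `find_library()` also probes the sibling `lib` directory, so OpenSSL can be found without any explicit hint:

```cmake
# Sketch: on Windows hosts, find_library() derives <prefix>/lib from
# each <prefix>/bin (or <prefix>/sbin) entry in PATH. With
#   PATH=...;C:\Program Files\OpenSSL-Win64\bin;...
# this call can resolve libssl without OPENSSL_ROOT_DIR being set.
find_library(OPENSSL_SSL_LIBRARY NAMES ssl libssl)
message(STATUS "libssl found at: ${OPENSSL_SSL_LIBRARY}")
```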
* Closes: #36922

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 .github/workflows/cpp.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 63a16c8c114ba..cd12be11488bb 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -269,8 +269,6 @@ jobs:
       CMAKE_INSTALL_LIBDIR: bin
       CMAKE_INSTALL_PREFIX: /usr
       CMAKE_UNITY_BUILD: ON
-      OPENSSL_ROOT_DIR: >-
-        C:\Program Files\OpenSSL-Win64
       NPROC: 3
     steps:
       - name: Disable Crash Dialogs

From e53a2e08a820c76aacfc6331855fd32a3073629c Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Fri, 28 Jul 2023 03:08:10 +0200
Subject: [PATCH 061/749] GH-36843: [Python][Docs] Add dict to docstring
 (#36842)

### Rationale for this change

We can pass `dict[str, Expression]` as columns too.

### What changes are included in this PR?

Add `dict[str, Expression]` to the docstring.

### Are these changes tested?

No.

### Are there any user-facing changes?

Yes.

* Closes: #36843

Lead-authored-by: Fokko Driesprong
Co-authored-by: Fokko Driesprong
Signed-off-by: Sutou Kouhei
---
 python/pyarrow/_dataset.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 925565804f63e..badf6e4a4c5dc 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -3337,7 +3337,7 @@ cdef class Scanner(_Weakrefable):
     ----------
     dataset : Dataset
         Dataset to scan.
-    columns : list of str, default None
+    columns : list[str] or dict[str, Expression], default None
         The columns to project. This can be a list of column names to
         include (order and duplicates will be preserved), or a dictionary
         with {new_column_name: expression} values for more advanced
@@ -3416,7 +3416,7 @@
         fragment to scan.
     schema : Schema, optional
         The schema of the fragment.
-    columns : list of str, default None
+    columns : list[str] or dict[str, Expression], default None
         The columns to project. This can be a list of column names to
         include (order and duplicates will be preserved), or a dictionary
         with {new_column_name: expression} values for more advanced
@@ -3502,7 +3502,7 @@
         The iterator of Batches.
     schema : Schema
         The schema of the batches.
-    columns : list of str, default None
+    columns : list[str] or dict[str, Expression], default None
         The columns to project. This can be a list of column names to
         include (order and duplicates will be preserved), or a dictionary
         with {new_column_name: expression} values for more advanced

From 40debb2149dbecf7e4bd94c3c77c6d5451b76b03 Mon Sep 17 00:00:00 2001
From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com>
Date: Thu, 27 Jul 2023 21:28:02 -0400
Subject: [PATCH 062/749] GH-36920: [Java][Docs] Add ARROW_JSON var to maven
 build profile (#36921)

### Rationale for this change

JSON support in the datasets module was recently introduced. Update the maven build profile and build documentation to reflect this.

Similar to https://github.com/apache/arrow/pull/36899

### What changes are included in this PR?

* update maven build profile
* update build docs

### Are these changes tested?

Yes, tested locally on M1 Mac.

### Are there any user-facing changes?
No * Closes: #36920 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- docs/source/developers/java/building.rst | 2 ++ java/pom.xml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 9d773f051451e..5dd3a979d8293 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -216,6 +216,7 @@ CMake -DARROW_FILESYSTEM=ON \ -DARROW_GANDIVA=ON \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ + -DARROW_JSON=ON \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_S3=ON \ @@ -256,6 +257,7 @@ CMake -DARROW_DATASET=ON ^ -DARROW_DEPENDENCY_USE_SHARED=OFF ^ -DARROW_FILESYSTEM=ON ^ + -DARROW_JSON=ON ^ -DARROW_ORC=OFF ^ -DARROW_PARQUET=ON ^ -DARROW_S3=ON ^ diff --git a/java/pom.xml b/java/pom.xml index e997406f0238d..7f8de003a3355 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1019,6 +1019,7 @@ -DARROW_FILESYSTEM=ON -DARROW_GANDIVA=${ARROW_GANDIVA} -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON + -DARROW_JSON=${ARROW_DATASET} -DARROW_ORC=${ARROW_ORC} -DARROW_PARQUET=${ARROW_PARQUET} -DARROW_S3=ON @@ -1129,6 +1130,7 @@ -DARROW_DATASET=ON -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON + -DARROW_JSON=${ARROW_DATASET} -DARROW_ORC=${ARROW_ORC} -DARROW_PARQUET=${ARROW_PARQUET} -DARROW_S3=ON From 47e438adf74d0c38068d9fd31fca083c650209c8 Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Fri, 28 Jul 2023 13:31:01 +1100 Subject: [PATCH 063/749] GH-15017: [Python] Harden test_memory.py for use with ARROW_USE_GLOG=ON (#36901) Accept output pattern for ARROW_USE_GLOG=ON too. * Closes: #15017 Lead-authored-by: H. Vetinari Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 2 -- python/pyarrow/tests/test_memory.py | 10 ++++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index e61034c3075b3..ac4b29eb5ee7e 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -403,8 +403,6 @@ outputs: {% set tests_to_skip = tests_to_skip + " or test_safe_cast_from_float_with_nans_to_int" %} # [ppc64le] # gandiva tests are segfaulting on ppc {% set tests_to_skip = tests_to_skip + " or test_float_with_null_as_integer" %} # [ppc64le] - # "Unsupported backend 'nonexistent' specified in ARROW_DEFAULT_MEMORY_POOL" - {% set tests_to_skip = tests_to_skip + " or (test_memory and test_env_var)" %} # [unix] # test is broken; header is in $PREFIX, not $SP_DIR {% set tests_to_skip = tests_to_skip + " or (test_misc and test_get_include)" %} # [unix] # flaky tests that fail occasionally diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py index 092c50de33b90..d9fdeb152c35e 100644 --- a/python/pyarrow/tests/test_memory.py +++ b/python/pyarrow/tests/test_memory.py @@ -134,8 +134,14 @@ def check_env_var(name, expected, *, expect_warning=False): res.check_returncode() # fail errlines = res.stderr.splitlines() if expect_warning: - assert len(errlines) == 1 - assert f"Unsupported backend '{name}'" in errlines[0] + assert len(errlines) in (1, 2) + if len(errlines) == 1: + # ARROW_USE_GLOG=OFF + assert f"Unsupported backend '{name}'" in errlines[0] + else: + # ARROW_USE_GLOG=ON + assert "InitGoogleLogging()" in errlines[0] + assert f"Unsupported backend '{name}'" in errlines[1] else: assert len(errlines) == 0 From 8c4941b879670ec9dc3608b77637f44fddf32178 Mon Sep 17 00:00:00 2001 From: 
Nic Crane Date: Fri, 28 Jul 2023 09:34:29 +0100 Subject: [PATCH 064/749] GH-36883: [R] Remove version number which triggers CRAN warning (#36884) ### What changes are included in this PR? Updates the package version number to be character, not numeric. ### Are these changes tested? No; these are configure scripts. ### Are there any user-facing changes? No * Closes: #36883 Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/tools/nixlibs.R | 2 +- r/tools/winlibs.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 8b353fd09bb5a..90ea868ea3491 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -30,7 +30,7 @@ if (test_mode && is.na(VERSION)) { dev_version <- package_version(VERSION)[1, 4] # Small dev versions are added for R-only changes during CRAN submission. -if (is.na(dev_version) || dev_version < 100) { +if (is.na(dev_version) || dev_version < "100") { VERSION <- package_version(VERSION)[1, 1:3] arrow_repo <- paste0(getOption("arrow.repo", sprintf("https://apache.jfrog.io/artifactory/arrow/r/%s", VERSION)), "/libarrow/") } else { diff --git a/r/tools/winlibs.R b/r/tools/winlibs.R index d941da4baa61f..b554770e40c9b 100644 --- a/r/tools/winlibs.R +++ b/r/tools/winlibs.R @@ -53,7 +53,7 @@ if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) { dev_version <- package_version(VERSION)[1, 4] # Small dev versions are added for R-only changes during CRAN submission. - if (is.na(dev_version) || dev_version < 100) { + if (is.na(dev_version) || dev_version < "100") { VERSION <- package_version(VERSION)[1, 1:3] get_file(rwinlib, VERSION) From 6b1c7234f7e7fd1d0b543a0513deb3f79f1af131 Mon Sep 17 00:00:00 2001 From: panbingkun <84731559@qq.com> Date: Fri, 28 Jul 2023 22:47:42 +0800 Subject: [PATCH 065/749] GH-36928: [Java] Make it run well with the newest Netty version, 4.1.96 (#36926) When I used `netty arrow memory 13.0.0` and `netty 4.1.96.Final` in Spark, the following error occurred. Because `netty 4.1.96.Final` has reverted some modifications, I suggest making similar modifications here so that `netty arrow memory 13.0.0` keeps working with `netty 4.1.96.Final`.
1. Compilation errors are as follows (screenshot omitted): https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark/builds/47657403 2. Some modifications have been reverted in `netty 4.1.96.Final`, as described in the release notes and PR below (screenshots omitted): https://netty.io/news/2023/07/27/4-1-96-Final.html https://github.com/netty/netty/pull/13510 * Closes: #36928 Authored-by: panbingkun Signed-off-by: David Li --- .../src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 870114d7db1b5..06c6669cfd162 100644 --- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -161,7 +161,7 @@ public InnerAllocator() { } private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCapacity) { - PoolArenasCache cache = threadCache(); + PoolThreadCache cache = threadCache(); PoolArena directArena = cache.directArena; if (directArena != null) { diff --git a/java/pom.xml b/java/pom.xml index 7f8de003a3355..d6b37c7df969d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.9.0 1.7.25 31.1-jre - 4.1.94.Final + 4.1.96.Final 1.56.0 3.23.1 2.15.1 From 2a9c8b1bda41cb87fb211ad209347ca2259253aa Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 31 Jul 2023 06:11:06 +0900 Subject: [PATCH 066/749] GH-36837: [CI][RPM] Use multiple cores to install gems (#36838) ### Rationale for this change We may reduce test time by using multiple cores to install gems. ### What changes are included in this PR? `gem install` with `MAKEFLAGS=-j$(nproc)` uses multiple cores when building extension libraries. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #36837 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/release/verify-yum.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/release/verify-yum.sh b/dev/release/verify-yum.sh index 03aa8e9dccc75..55fc0c1735931 100755 --- a/dev/release/verify-yum.sh +++ b/dev/release/verify-yum.sh @@ -234,7 +234,7 @@ if [ "${have_glib}" = "yes" ]; then if [ "${have_ruby}" = "yes" ]; then ${install_command} "${ruby_devel_packages[@]}" - gem install gobject-introspection + MAKEFLAGS="-j$(nproc)" gem install gobject-introspection ruby -r gi -e "p GI.load('Arrow')" fi echo "::endgroup::" From af23f6a2e8ece6211b087b0e4f24b9daaffbb8a9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 31 Jul 2023 06:11:32 +0900 Subject: [PATCH 067/749] GH-36685: [R][C++] Fix illegal opcode failure with Homebrew (#36705) ### Rationale for this change Summary of this problem: https://github.com/apache/arrow/issues/31132#issuecomment-1378082076 Why did this problem happen again? Because I removed `ENV["HOMEBREW_OPTIMIZATION_LEVEL"] = "O2"` in #36583. The solution we chose in #14342 was to force `-O2` for SIMD-related code. It works for `-DCMAKE_BUILD_TYPE=MinSizeRel`, but it doesn't work for Homebrew, because Homebrew's CC https://github.com/Homebrew/brew/blob/master/Library/Homebrew/shims/super/cc forces the same `-O` flag everywhere. The default is `-Os`. If we specify `-O2`, Homebrew's CC replaces it with `-Os`. If we use `ENV["HOMEBREW_OPTIMIZATION_LEVEL"] = "O2"`, Homebrew's CC always uses `-O2`. So the solution we chose in #14342 isn't used for Homebrew (a sketch of this rewriting follows).
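A hedged illustration of that flag rewriting (not captured output; derived only from the description above):

```console
# Default Homebrew build: the cc shim rewrites any -O flag to the
# formula's optimization level, which defaults to -Os.
$ cc -O2 -c simd_code.cc
  # effectively runs: clang -Os -c simd_code.cc

# With ENV["HOMEBREW_OPTIMIZATION_LEVEL"] = "O2" in the formula,
# every -O flag is rewritten to -O2 instead.
$ cc -Os -c other_code.cc
  # effectively runs: clang -O2 -c other_code.cc
```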
But Homebrew thinks that `ENV["HOMEBREW_OPTIMIZATION_LEVEL"] = "O2"` is a workaround. So we need another solution for Homebrew. Here are candidate solutions: 1. `-DARROW_RUNTIME_SIMD_LEVEL=NONE` 2. Remove `ENV.runtime_cpu_detection if Hardware::CPU.intel?` "1. `-DARROW_RUNTIME_SIMD_LEVEL=NONE`" works because we don't use the runtime SIMD dispatch feature (the problematic feature) at all. "2. Remove `ENV.runtime_cpu_detection if Hardware::CPU.intel?`" works, but I don't know why... If `ENV.runtime_cpu_detection` is called, Homebrew's CC stops replacing `-march=*`. If we call `ENV.runtime_cpu_detection`, `-march=haswell` is used for AVX2-related code and `-march=skylake-avx512` is used for AVX512-related (including BMI2) code. If we don't call `ENV.runtime_cpu_detection`, `-march=nehalem` is always used. (Note that SIMD-related flags such as `-mbmi2` aren't removed by Homebrew's CC, so I think SIMD is still enabled.) I don't know why, but "the one-definition-rule violation" (see the summary for details: https://github.com/apache/arrow/issues/31132#issuecomment-1378082076 ) doesn't happen. FYI: CPU info for the GitHub Actions macOS hosted runner: ```console $ sysctl hw.optional machdep.cpu hw.optional.adx: 0 hw.optional.aes: 1 hw.optional.avx1_0: 1 hw.optional.avx2_0: 0 hw.optional.avx512bw: 0 hw.optional.avx512cd: 0 hw.optional.avx512dq: 0 hw.optional.avx512f: 0 hw.optional.avx512ifma: 0 hw.optional.avx512vbmi: 0 hw.optional.avx512vl: 0 hw.optional.bmi1: 0 hw.optional.bmi2: 0 hw.optional.enfstrg: 0 hw.optional.f16c: 1 hw.optional.floatingpoint: 1 hw.optional.fma: 0 hw.optional.hle: 0 hw.optional.mmx: 1 hw.optional.mpx: 0 hw.optional.rdrand: 1 hw.optional.rtm: 0 hw.optional.sgx: 0 hw.optional.sse: 1 hw.optional.sse2: 1 hw.optional.sse3: 1 hw.optional.sse4_1: 1 hw.optional.sse4_2: 1 hw.optional.supplementalsse3: 1 hw.optional.x86_64: 1 machdep.cpu.address_bits.physical: 43 machdep.cpu.address_bits.virtual: 48 machdep.cpu.arch_perf.events: 127 machdep.cpu.arch_perf.events_number: 7 machdep.cpu.arch_perf.fixed_number: 0 machdep.cpu.arch_perf.fixed_width: 0 machdep.cpu.arch_perf.number: 4 machdep.cpu.arch_perf.version: 1 machdep.cpu.arch_perf.width: 48 machdep.cpu.cache.L2_associativity: 8 machdep.cpu.cache.linesize: 64 machdep.cpu.cache.size: 256 machdep.cpu.mwait.extensions: 3 machdep.cpu.mwait.linesize_max: 4096 machdep.cpu.mwait.linesize_min: 64 machdep.cpu.mwait.sub_Cstates: 16 machdep.cpu.thermal.ACNT_MCNT: 0 machdep.cpu.thermal.core_power_limits: 0 machdep.cpu.thermal.dynamic_acceleration: 0 machdep.cpu.thermal.energy_policy: 0 machdep.cpu.thermal.fine_grain_clock_mod: 0 machdep.cpu.thermal.hardware_feedback: 0 machdep.cpu.thermal.invariant_APIC_timer: 1 machdep.cpu.thermal.package_thermal_intr: 0 machdep.cpu.thermal.sensor: 0 machdep.cpu.thermal.thresholds: 0 machdep.cpu.tlb.data.small: 64 machdep.cpu.tlb.inst.large: 8 machdep.cpu.tlb.inst.small: 64 machdep.cpu.tlb.shared: 512 machdep.cpu.tsc_ccc.denominator: 0 machdep.cpu.tsc_ccc.numerator: 0 machdep.cpu.xsave.extended_state: 7 832 832 0 machdep.cpu.xsave.extended_state1: 0 0 0 0 machdep.cpu.brand: 0 machdep.cpu.brand_string: Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz machdep.cpu.core_count: 3 machdep.cpu.cores_per_package: 4 machdep.cpu.extfamily: 0 machdep.cpu.extfeature_bits: 4967106816 machdep.cpu.extfeatures: SYSCALL XD EM64T LAHF RDTSCP TSCI machdep.cpu.extmodel: 3 machdep.cpu.family: 6 machdep.cpu.feature_bits: 18427078393948011519 machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH MMX FXSR
SSE SSE2 SS HTT SSE3 PCLMULQDQ MON VMX SSSE3 CX16 SSE4.1 SSE4.2 x2APIC POPCNT AES VMM PCID XSAVE OSXSAVE TSCTMR AVX1.0 RDRAND F16C machdep.cpu.leaf7_feature_bits: 643 0 machdep.cpu.leaf7_feature_bits_edx: 3154117632 machdep.cpu.leaf7_features: RDWRFSGS TSC_THREAD_OFFSET SMEP ERMS MDCLEAR IBRS STIBP L1DF ACAPMSR SSBD machdep.cpu.logical_per_package: 4 machdep.cpu.max_basic: 13 machdep.cpu.max_ext: 2147483656 machdep.cpu.microcode_version: 1070 machdep.cpu.model: 58 machdep.cpu.processor_flag: 0 machdep.cpu.signature: 198313 machdep.cpu.stepping: 9 machdep.cpu.thread_count: 3 machdep.cpu.vendor: GenuineIntel ``` ### What changes are included in this PR? "1. `-DARROW_RUNTIME_SIMD_LEVEL=NONE`" because it's straightforward and "2. Remove `ENV.runtime_cpu_detection if Hardware::CPU.intel?`" may also disable runtime SIMD dispatch implicitly. This also adds the following debug information for easy to debug in future: * CPU information for GitHub Actions runner * Homebrew's build logs ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #36685 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/homebrew-formulae/apache-arrow.rb | 8 +++++++- dev/tasks/macros.jinja | 4 +++- dev/tasks/r/github.macos.brew.yml | 20 ++++++++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index 9086fdaae4ab4..f5d7ff36ea041 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -57,7 +57,8 @@ class ApacheArrow < Formula fails_with gcc: "5" def install - # https://github.com/Homebrew/homebrew-core/issues/76537 + # This isn't for https://github.com/Homebrew/homebrew-core/issues/76537 . + # This may improve performance. ENV.runtime_cpu_detection if Hardware::CPU.intel? # link against system libc++ instead of llvm provided libc++ @@ -90,6 +91,11 @@ def install -DARROW_WITH_ZSTD=ON -DPARQUET_BUILD_EXECUTABLES=ON ] + # Disable runtime SIMD dispatch. It may cause "illegal opcode" + # error on Intel Mac because of one-definition-rule violation. + # + # https://github.com/apache/arrow/issues/36685 + args << "-DARROW_RUNTIME_SIMD_LEVEL=NONE" if OS.mac? and Hardware::CPU.intel? system "cmake", "-S", "cpp", "-B", "build", *args, *std_cmake_args system "cmake", "--build", "build" diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 475494af18ce6..1f2f9a72975eb 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -214,6 +214,7 @@ on: # see https://github.com/actions/runner-images/issues/6868 brew install --overwrite python@3.11 python@3.10 + set -x ARROW_GLIB_FORMULA=$(echo ${ARROW_FORMULA} | sed -e 's/\.rb/-glib.rb/') echo "ARROW_GLIB_FORMULA=${ARROW_GLIB_FORMULA}" >> ${GITHUB_ENV} for formula in ${ARROW_FORMULA} ${ARROW_GLIB_FORMULA}; do @@ -223,11 +224,12 @@ on: # Pin the current commit in the formula to test so that # we're not always pulling from the tip of the default branch sed -i '' -E \ - -e 's@https://github.com/apache/arrow.git"$@{{ arrow.remote }}.git", revision: "{{ arrow.head }}"@' \ + -e 's@https://github.com/apache/arrow.git", branch: "main"$@{{ arrow.remote }}.git", revision: "{{ arrow.head }}"@' \ ${formula} # Sometimes crossbow gives a remote URL with .git and sometimes not. 
# Make sure there's only one sed -i '' -E -e 's@.git.git@.git@' ${formula} + cat ${formula} cp ${formula} $(brew --repository homebrew/core)/Formula/ done {% endmacro %} diff --git a/dev/tasks/r/github.macos.brew.yml b/dev/tasks/r/github.macos.brew.yml index dea7564c5f80f..8a0e6f6bb96e3 100644 --- a/dev/tasks/r/github.macos.brew.yml +++ b/dev/tasks/r/github.macos.brew.yml @@ -24,21 +24,33 @@ jobs: name: "Homebrew + R package" runs-on: macOS-11 steps: + - name: Show system information + run: | + sysctl hw.optional machdep.cpu + {{ macros.github_checkout_arrow()|indent }} {{ macros.configure_homebrew_arrow(formula)|indent }} - name: Install apache-arrow env: - {{ macros.github_set_sccache_envvars()|indent(8)}} + {{ macros.github_set_sccache_envvars()|indent(8)}} run: | - brew install sccache # for testing brew install minio - + # TODO(ARROW-16907): apache/arrow@main seems to be installed already # so this does nothing on a branch/PR - brew install -v --HEAD apache-arrow + brew install -v --HEAD {{ '$(brew --repository homebrew/core)/Formula/apache-arrow.rb' }} + + mkdir -p homebrew-logs + cp -a ~/Library/Logs/Homebrew/apache-arrow homebrew-logs/ + - name: Save logs + if: always() + uses: actions/upload-artifact@v2 + with: + name: homebrew-logs + path: homebrew-logs - uses: r-lib/actions/setup-r@v2 - name: Install dependencies From eddb2b960024c1c8f1c384755bcb2488f116dca2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 31 Jul 2023 14:06:45 +0900 Subject: [PATCH 068/749] GH-36944: [C++] Unify OpenSSL detection for building GCS (#36945) ### Rationale for this change `build_google_cloud_cpp_storage()` calls `resolve_dependency(OpenSSL)`, but it should not be called there. We should have only one `resolve_dependency(OpenSSL)` for ease of maintenance. ### What changes are included in this PR? Don't call `resolve_dependency(OpenSSL)` from `build_google_cloud_cpp_storage()`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #36944 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 635bc1684e6f2..1e7840cf92e08 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1368,8 +1368,9 @@ set(ARROW_OPENSSL_REQUIRED_VERSION "1.0.2") set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT - OR ARROW_S3 - OR ARROW_GANDIVA) + OR ARROW_GANDIVA + OR ARROW_GCS + OR ARROW_S3) set(OpenSSL_SOURCE "SYSTEM") resolve_dependency(OpenSSL HAVE_ALT @@ -4106,10 +4107,6 @@ macro(build_google_cloud_cpp_storage) # Curl is required on all platforms, but building it internally might also trip over S3's copy. # For now, force its inclusion from the underlying system or fail. find_curl() - if(NOT OpenSSL_FOUND) - resolve_dependency(OpenSSL HAVE_ALT REQUIRED_VERSION - ${ARROW_OPENSSL_REQUIRED_VERSION}) - endif() # Build google-cloud-cpp, with only storage_client From a06b2618420ef89431373a9e8f07a5da64d546a5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 31 Jul 2023 15:27:33 +0900 Subject: [PATCH 069/749] GH-36941: [CI][Docs] Use system Protobuf (#36943) ### Rationale for this change We can reduce disk usage by using system Protobuf and gRPC. ### What changes are included in this PR? Use system Protobuf. We can use Protobuf 3.12.0 or later thanks to #35962. ### Are these changes tested? Yes.
### Are there any user-facing changes? No. * Closes: #36941 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 8 ++++++-- docker-compose.yml | 1 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 05dafade97434..25bda38809e3b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -64,7 +64,7 @@ groups: - r-binary-packages - ubuntu-* - wheel-* - - test-ubuntu-default-docs + - test-ubuntu-*-docs {############################# Testing tasks #################################} @@ -1498,10 +1498,12 @@ tasks: image: debian-go {% endfor %} - test-ubuntu-default-docs: + test-ubuntu-22.04-docs: ci: github template: docs/github.linux.yml params: + env: + UBUNTU: 22.04 pr_number: Unset flags: "-v $PWD/build/:/build/" image: ubuntu-docs @@ -1625,6 +1627,8 @@ tasks: ci: github template: docs/github.linux.yml params: + env: + UBUNTU: 22.04 pr_number: Unset flags: "-v $PWD/build/:/build/" image: ubuntu-docs diff --git a/docker-compose.yml b/docker-compose.yml index 8727aded2c825..fe98a30d0b92b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1745,7 +1745,6 @@ services: BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" - Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 volumes: *ubuntu-volumes command: &docs-command > /bin/bash -c " From 37cb59240b1fa4c5b8e596afdaebf9435c415cec Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 31 Jul 2023 16:33:28 -0400 Subject: [PATCH 070/749] GH-36952: [C++][FlightRPC][Python] Add methods to send headers (#36956) ### Rationale for this change Sending headers/trailers is required for services, but you couldn't do this before. ### What changes are included in this PR? Add new methods to directly send headers/trailers. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes (new APIs) * Closes: #36952 Authored-by: David Li Signed-off-by: Sutou Kouhei --- cpp/src/arrow/flight/client_middleware.h | 5 ++ cpp/src/arrow/flight/server.h | 9 ++ cpp/src/arrow/flight/test_definitions.cc | 87 +++++++++++++++++-- cpp/src/arrow/flight/test_definitions.h | 9 +- .../flight/transport/grpc/grpc_client.cc | 18 +--- .../flight/transport/grpc/grpc_server.cc | 9 ++ .../ucx/flight_transport_ucx_test.cc | 2 + .../arrow/flight/transport/ucx/ucx_server.cc | 3 + python/pyarrow/_flight.pyx | 8 ++ python/pyarrow/includes/libarrow_flight.pxd | 2 + python/pyarrow/tests/test_flight.py | 44 +++++++++- 11 files changed, 174 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/flight/client_middleware.h b/cpp/src/arrow/flight/client_middleware.h index 5b67e784b9eda..8e3126553a953 100644 --- a/cpp/src/arrow/flight/client_middleware.h +++ b/cpp/src/arrow/flight/client_middleware.h @@ -42,6 +42,11 @@ class ARROW_FLIGHT_EXPORT ClientMiddleware { virtual void SendingHeaders(AddCallHeaders* outgoing_headers) = 0; /// \brief A callback when headers are received from the server. + /// + /// This may be called more than once, since servers send both + /// headers and trailers. Some implementations (e.g. gRPC-Java, and + /// hence Arrow Flight in Java) may consolidate headers into + /// trailers if the RPC errored. virtual void ReceivedHeaders(const CallHeaders& incoming_headers) = 0; /// \brief A callback after the call has completed. 
diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 76f1a317068e6..049c6cee3ffcf 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -122,6 +122,15 @@ class ARROW_FLIGHT_EXPORT ServerCallContext { virtual const std::string& peer_identity() const = 0; /// \brief The peer address (not validated) virtual const std::string& peer() const = 0; + /// \brief Add a response header. This is only valid before the server + /// starts sending the response; generally this isn't an issue unless you + /// are implementing FlightDataStream, ResultStream, or similar interfaces + /// yourself, or during a DoExchange or DoPut. + virtual void AddHeader(const std::string& key, const std::string& value) const = 0; + /// \brief Add a response trailer. This is only valid before the server + /// sends the final status; generally this isn't an issue unless your RPC + /// handler launches a thread or similar. + virtual void AddTrailer(const std::string& key, const std::string& value) const = 0; /// \brief Look up a middleware by key. Do not maintain a reference /// to the object beyond the request body. /// \return The middleware, or nullptr if not found. diff --git a/cpp/src/arrow/flight/test_definitions.cc b/cpp/src/arrow/flight/test_definitions.cc index 507c5ef40421c..4e137380044f3 100644 --- a/cpp/src/arrow/flight/test_definitions.cc +++ b/cpp/src/arrow/flight/test_definitions.cc @@ -18,17 +18,22 @@ #include "arrow/flight/test_definitions.h" #include +#include +#include #include "arrow/array/array_base.h" #include "arrow/array/array_dict.h" #include "arrow/array/util.h" #include "arrow/flight/api.h" +#include "arrow/flight/client_middleware.h" #include "arrow/flight/test_util.h" #include "arrow/table.h" #include "arrow/testing/generator.h" +#include "arrow/testing/gtest_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/config.h" #include "arrow/util/logging.h" +#include "gmock/gmock.h" #if defined(ARROW_CUDA) #include "arrow/gpu/cuda_api.h" @@ -1438,20 +1443,26 @@ class ErrorHandlingTestServer : public FlightServerBase { public: Status GetFlightInfo(const ServerCallContext& context, const FlightDescriptor& request, std::unique_ptr* info) override { - if (request.path.size() >= 2) { + if (request.path.size() == 1 && request.path[0] == "metadata") { + context.AddHeader("x-header", "header-value"); + context.AddHeader("x-header-bin", "header\x01value"); + context.AddTrailer("x-trailer", "trailer-value"); + context.AddTrailer("x-trailer-bin", "trailer\x01value"); + return Status::Invalid("Expected"); + } else if (request.path.size() >= 2) { const int raw_code = std::atoi(request.path[0].c_str()); ARROW_ASSIGN_OR_RAISE(StatusCode code, TryConvertStatusCode(raw_code)); if (request.path.size() == 2) { - return Status(code, request.path[1]); + return {code, request.path[1]}; } else if (request.path.size() == 3) { - return Status(code, request.path[1], std::make_shared()); + return {code, request.path[1], std::make_shared()}; } else { const int raw_code = std::atoi(request.path[2].c_str()); ARROW_ASSIGN_OR_RAISE(FlightStatusCode flight_code, TryConvertFlightStatusCode(raw_code)); - return Status(code, request.path[1], - std::make_shared(flight_code, request.path[3])); + return {code, request.path[1], + std::make_shared(flight_code, request.path[3])}; } } return Status::NotImplemented("NYI"); @@ -1469,20 +1480,70 @@ class ErrorHandlingTestServer : public FlightServerBase { return MakeFlightError(FlightStatusCode::Unauthorized, "Unauthorized", "extra 
info"); } }; + +class MetadataRecordingClientMiddleware : public ClientMiddleware { + public: + explicit MetadataRecordingClientMiddleware( + std::mutex& mutex, std::vector>& headers) + : mutex_(mutex), headers_(headers) {} + void SendingHeaders(AddCallHeaders*) override {} + void ReceivedHeaders(const CallHeaders& incoming_headers) override { + std::lock_guard guard(mutex_); + for (const auto& [key, value] : incoming_headers) { + headers_.emplace_back(key, value); + } + } + void CallCompleted(const Status&) override {} + + private: + std::mutex& mutex_; + std::vector>& headers_; +}; + +class MetadataRecordingClientMiddlewareFactory : public ClientMiddlewareFactory { + public: + void StartCall(const CallInfo&, + std::unique_ptr* middleware) override { + *middleware = std::make_unique(mutex_, headers_); + } + + std::vector> GetHeaders() const { + std::lock_guard guard(mutex_); + // Take copy + return headers_; + } + + private: + mutable std::mutex mutex_; + std::vector> headers_; +}; } // namespace +struct ErrorHandlingTest::Impl { + std::shared_ptr metadata = + std::make_shared(); +}; + void ErrorHandlingTest::SetUpTest() { + impl_ = std::make_shared(); ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); ASSERT_OK(MakeServer( location, &server_, &client_, [](FlightServerOptions* options) { return Status::OK(); }, - [](FlightClientOptions* options) { return Status::OK(); })); + [&](FlightClientOptions* options) { + options->middleware.emplace_back(impl_->metadata); + return Status::OK(); + })); } void ErrorHandlingTest::TearDownTest() { ASSERT_OK(client_->Close()); ASSERT_OK(server_->Shutdown()); } +std::vector> ErrorHandlingTest::GetHeaders() { + return impl_->metadata->GetHeaders(); +} + void ErrorHandlingTest::TestGetFlightInfo() { std::unique_ptr info; for (const auto code : kStatusCodes) { @@ -1518,6 +1579,20 @@ void ErrorHandlingTest::TestGetFlightInfo() { } } +void ErrorHandlingTest::TestGetFlightInfoMetadata() { + auto descr = FlightDescriptor::Path({"metadata"}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Expected"), + client_->GetFlightInfo(descr)); + // This is janky because we don't/can't expose grpc::CallContext. 
+ // See https://github.com/apache/arrow/issues/34607 + ASSERT_THAT(GetHeaders(), ::testing::IsSupersetOf({ + std::make_pair("x-header", "header-value"), + std::make_pair("x-header-bin", "header\x01value"), + std::make_pair("x-trailer", "trailer-value"), + std::make_pair("x-trailer-bin", "trailer\x01value"), + })); +} + void CheckErrorDetail(const Status& status) { auto detail = FlightStatusDetail::UnwrapStatus(status); ASSERT_NE(detail, nullptr) << status.ToString(); diff --git a/cpp/src/arrow/flight/test_definitions.h b/cpp/src/arrow/flight/test_definitions.h index 7a7f905f3e97c..c73bc264b4966 100644 --- a/cpp/src/arrow/flight/test_definitions.h +++ b/cpp/src/arrow/flight/test_definitions.h @@ -265,10 +265,16 @@ class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest { // Test methods void TestGetFlightInfo(); + void TestGetFlightInfoMetadata(); void TestDoPut(); void TestDoExchange(); - private: + protected: + struct Impl; + + std::vector> GetHeaders(); + + std::shared_ptr impl_; std::unique_ptr client_; std::unique_ptr server_; }; @@ -277,6 +283,7 @@ class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest { static_assert(std::is_base_of::value, \ ARROW_STRINGIFY(FIXTURE) " must inherit from ErrorHandlingTest"); \ TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); } \ + TEST_F(FIXTURE, TestGetFlightInfoMetadata) { TestGetFlightInfoMetadata(); } \ TEST_F(FIXTURE, TestDoPut) { TestDoPut(); } \ TEST_F(FIXTURE, TestDoExchange) { TestDoExchange(); } diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc index 89f088638320e..9b40015f9f729 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc @@ -107,9 +107,9 @@ class GrpcClientInterceptorAdapter : public ::grpc::experimental::Interceptor { public: explicit GrpcClientInterceptorAdapter( std::vector> middleware) - : middleware_(std::move(middleware)), received_headers_(false) {} + : middleware_(std::move(middleware)) {} - void Intercept(::grpc::experimental::InterceptorBatchMethods* methods) { + void Intercept(::grpc::experimental::InterceptorBatchMethods* methods) override { using InterceptionHookPoints = ::grpc::experimental::InterceptionHookPoints; if (methods->QueryInterceptionHookPoint( InterceptionHookPoints::PRE_SEND_INITIAL_METADATA)) { @@ -142,10 +142,6 @@ class GrpcClientInterceptorAdapter : public ::grpc::experimental::Interceptor { private: void ReceivedHeaders( const std::multimap<::grpc::string_ref, ::grpc::string_ref>& metadata) { - if (received_headers_) { - return; - } - received_headers_ = true; CallHeaders headers; for (const auto& entry : metadata) { headers.insert({std::string_view(entry.first.data(), entry.first.length()), @@ -157,20 +153,14 @@ class GrpcClientInterceptorAdapter : public ::grpc::experimental::Interceptor { } std::vector> middleware_; - // When communicating with a gRPC-Java server, the server may not - // send back headers if the call fails right away. Instead, the - // headers will be consolidated into the trailers. We don't want to - // call the client middleware callback twice, so instead track - // whether we saw headers - if not, then we need to check trailers. 
- bool received_headers_; }; class GrpcClientInterceptorAdapterFactory : public ::grpc::experimental::ClientInterceptorFactoryInterface { public: - GrpcClientInterceptorAdapterFactory( + explicit GrpcClientInterceptorAdapterFactory( std::vector> middleware) - : middleware_(middleware) {} + : middleware_(std::move(middleware)) {} ::grpc::experimental::Interceptor* CreateClientInterceptor( ::grpc::experimental::ClientRpcInfo* info) override { diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_server.cc b/cpp/src/arrow/flight/transport/grpc/grpc_server.cc index 2c7a1d5e99234..50d4ffe002c7e 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_server.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_server.cc @@ -111,6 +111,7 @@ class GrpcServerAuthSender : public ServerAuthSender { }; class GrpcServerCallContext : public ServerCallContext { + public: explicit GrpcServerCallContext(::grpc::ServerContext* context) : context_(context), peer_(context_->peer()) { for (const auto& entry : context->client_metadata()) { @@ -143,6 +144,14 @@ class GrpcServerCallContext : public ServerCallContext { return ToGrpcStatus(status, context_); } + void AddHeader(const std::string& key, const std::string& value) const override { + context_->AddInitialMetadata(key, value); + } + + void AddTrailer(const std::string& key, const std::string& value) const override { + context_->AddTrailingMetadata(key, value); + } + ServerMiddleware* GetMiddleware(const std::string& key) const override { const auto& instance = middleware_map_.find(key); if (instance == middleware_map_.end()) { diff --git a/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc b/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc index 3ac02bf7183a3..c3481d834f6ea 100644 --- a/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc +++ b/cpp/src/arrow/flight/transport/ucx/flight_transport_ucx_test.cc @@ -103,6 +103,8 @@ class UcxErrorHandlingTest : public ErrorHandlingTest, public ::testing::Test { std::string transport() const override { return "ucx"; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } + + void TestGetFlightInfoMetadata() { GTEST_SKIP() << "Middleware not implemented"; } }; ARROW_FLIGHT_TEST_ERROR_HANDLING(UcxErrorHandlingTest); diff --git a/cpp/src/arrow/flight/transport/ucx/ucx_server.cc b/cpp/src/arrow/flight/transport/ucx/ucx_server.cc index 4a573d742929a..8bbac34705c23 100644 --- a/cpp/src/arrow/flight/transport/ucx/ucx_server.cc +++ b/cpp/src/arrow/flight/transport/ucx/ucx_server.cc @@ -72,6 +72,9 @@ class UcxServerCallContext : public flight::ServerCallContext { public: const std::string& peer_identity() const override { return peer_; } const std::string& peer() const override { return peer_; } + // Not supported + void AddHeader(const std::string& key, const std::string& value) const override {} + void AddTrailer(const std::string& key, const std::string& value) const override {} ServerMiddleware* GetMiddleware(const std::string& key) const override { return nullptr; } diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index c9f5526754e3d..0572ed77b40ef 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -1756,6 +1756,14 @@ cdef class ServerCallContext(_Weakrefable): """Check if the current RPC call has been canceled by the client.""" return self.context.is_cancelled() + def add_header(self, key, value): + """Add a response header.""" + self.context.AddHeader(tobytes(key), tobytes(value)) + + def add_trailer(self, 
key, value): + """Add a response trailer.""" + self.context.AddTrailer(tobytes(key), tobytes(value)) + def get_middleware(self, key): """ Get a middleware instance by key. diff --git a/python/pyarrow/includes/libarrow_flight.pxd b/python/pyarrow/includes/libarrow_flight.pxd index 34ba809438e2c..624904ed77a69 100644 --- a/python/pyarrow/includes/libarrow_flight.pxd +++ b/python/pyarrow/includes/libarrow_flight.pxd @@ -257,6 +257,8 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: c_string& peer_identity() c_string& peer() c_bool is_cancelled() + void AddHeader(const c_string& key, const c_string& value) + void AddTrailer(const c_string& key, const c_string& value) CServerMiddleware* GetMiddleware(const c_string& key) cdef cppclass CTimeoutDuration" arrow::flight::TimeoutDuration": diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 930523b9f5442..6c1c582dceb21 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -833,7 +833,7 @@ def sending_headers(self): def received_headers(self, headers): # Let the test code know what the last set of headers we # received were. - self.factory.last_headers = headers + self.factory.last_headers.update(headers) class MultiHeaderServerMiddlewareFactory(ServerMiddlewareFactory): @@ -2323,3 +2323,45 @@ def test_do_put_does_not_crash_when_schema_is_none(): with pytest.raises(TypeError, match=msg): client.do_put(flight.FlightDescriptor.for_command('foo'), schema=None) + + +def test_headers_trailers(): + """Ensure that server-sent headers/trailers make it through.""" + + class HeadersTrailersFlightServer(FlightServerBase): + def get_flight_info(self, context, descriptor): + context.add_header("x-header", "header-value") + context.add_header("x-header-bin", "header\x01value") + context.add_trailer("x-trailer", "trailer-value") + context.add_trailer("x-trailer-bin", "trailer\x01value") + return flight.FlightInfo( + pa.schema([]), + descriptor, + [], + -1, -1 + ) + + class HeadersTrailersMiddlewareFactory(ClientMiddlewareFactory): + def __init__(self): + self.headers = [] + + def start_call(self, info): + return HeadersTrailersMiddleware(self) + + class HeadersTrailersMiddleware(ClientMiddleware): + def __init__(self, factory): + self.factory = factory + + def received_headers(self, headers): + for key, values in headers.items(): + for value in values: + self.factory.headers.append((key, value)) + + factory = HeadersTrailersMiddlewareFactory() + with HeadersTrailersFlightServer() as server, \ + FlightClient(("localhost", server.port), middleware=[factory]) as client: + client.get_flight_info(flight.FlightDescriptor.for_path("")) + assert ("x-header", "header-value") in factory.headers + assert ("x-header-bin", b"header\x01value") in factory.headers + assert ("x-trailer", "trailer-value") in factory.headers + assert ("x-trailer-bin", b"trailer\x01value") in factory.headers From 112f94971882750731fabebd499ab0f817ca3839 Mon Sep 17 00:00:00 2001 From: Ashish Bailkeri <47304318+aboss123@users.noreply.github.com> Date: Mon, 31 Jul 2023 16:57:01 -0400 Subject: [PATCH 071/749] GH-36323: [Python] Fix Timestamp scalar repr error on values outside datetime range (#36942) ### Rationale for this change https://github.com/apache/arrow/issues/36323 ### What changes are included in this PR? Changed the way repr is handled for TimestampScalar ### Are these changes tested? I have added a very basic test for this change to see whether it will error or not if outside the range. 
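For illustration, a minimal sketch of the failure mode being fixed (assumes a pyarrow build containing this change; the printed text is abbreviated, not an exact repr):

```python
import pyarrow as pa

# Year 0 is below datetime.MINYEAR, so converting this scalar to a
# datetime.datetime for repr() used to raise; the strftime-based
# formatting added here handles it.
s = pa.scalar("0000-01-01").cast(pa.timestamp("s"))
print(repr(s))  # now prints a TimestampScalar repr instead of raising
```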
### Are there any user-facing changes? The functionality of TimestampScalar's repr now uses the `strftime` function. * Closes: #36323 Lead-authored-by: Ashish Bailkeri Co-authored-by: Ashish Bailkeri <47304318+aboss123@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- python/pyarrow/scalar.pxi | 17 +++++++++++++++++ python/pyarrow/tests/test_convert_builtin.py | 2 +- python/pyarrow/tests/test_scalars.py | 12 ++++++++++++ python/pyarrow/types.pxi | 4 ++-- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 74f5aa4213ca0..aff1c311abbfb 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -522,6 +522,23 @@ cdef class TimestampScalar(Scalar): return _datetime_from_int(sp.value, unit=dtype.unit(), tzinfo=tzinfo) + def __repr__(self): + """ + Return the representation of TimestampScalar using `strftime` to avoid + original repr datetime values being out of range. + """ + cdef: + CTimestampScalar* sp = self.wrapped.get() + CTimestampType* dtype = sp.type.get() + + if not dtype.timezone().empty(): + type_format = str(_pc().strftime(self, format="%Y-%m-%dT%H:%M:%S%z")) + else: + type_format = str(_pc().strftime(self)) + return ''.format( + self.__class__.__name__, type_format + ) + cdef class DurationScalar(Scalar): """ diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index af4c91a89459d..cf2535a3c62d1 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1353,7 +1353,7 @@ def test_sequence_timestamp_from_int_with_unit(): assert len(arr_s) == 1 assert arr_s.type == s assert repr(arr_s[0]) == ( - "" + "" ) assert str(arr_s[0]) == "1970-01-01 00:00:01" diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 2aaefe16ae469..a989301fe5735 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -154,6 +154,18 @@ def test_hashing_struct_scalar(): assert hash1 == hash2 +def test_timestamp_scalar(): + a = repr(pa.scalar("0000-01-01").cast(pa.timestamp("s"))) + assert a == "" + b = repr(pa.scalar(datetime.datetime(2015, 1, 1), type=pa.timestamp('s', tz='UTC'))) + assert b == "" + c = repr(pa.scalar(datetime.datetime(2015, 1, 1), type=pa.timestamp('us'))) + assert c == "" + d = repr(pc.assume_timezone( + pa.scalar("2000-01-01").cast(pa.timestamp("s")), "America/New_York")) + assert d == "" + + def test_bool(): false = pa.scalar(False) true = pa.scalar(True) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index fbd4f8a94b64c..12ad2fc4b6f60 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -3605,9 +3605,9 @@ def timestamp(unit, tz=None): >>> from datetime import datetime >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('s', tz='UTC')) - )> + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp('us')) - + Returns ------- From 334b46d349af897ba03d7d72be86d23aa5ee8b43 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 31 Jul 2023 22:14:27 -0700 Subject: [PATCH 072/749] GH-35409: [Python][Docs] Clarify S3FileSystem Credentials chain for EC2 (#35312) ### Rationale for this change When resolving AWS credentials on EC2 hosts, the underlying AWS SDK also looks at the EC2 Instance Metadata Service. I want to document this behavior for `pyarrow`. The [`s3fs` documentation](https://s3fs.readthedocs.io/en/latest/#credentials) mention this specific case for EC2. 
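For illustration, a minimal sketch of relying on that default chain (the bucket, key, and region below are hypothetical):

```python
import pyarrow.fs as fs

# No access_key/secret_key/role_arn given: the AWS SDK's default
# provider chain resolves credentials from environment variables, AWS
# configuration files, and (on EC2) the Instance Metadata Service.
s3 = fs.S3FileSystem(region="us-east-1")
print(s3.get_file_info("my-bucket/some/key.parquet"))
```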
### What changes are included in this PR? Documentation for the behavior described above. #### Technical Details `S3FileSystem` uses the [`CS3Options.Defaults()`](https://github.com/apache/arrow/blob/5de56928e0fe43f02005552eee058de57ffb2682/python/pyarrow/_s3fs.pyx#L317) option when no credentials are passed into the constructor. It utilizes the [`Aws::Auth::DefaultAWSCredentialsProviderChain`](https://github.com/apache/arrow/blob/1de159d0f6763766c19b183dd309b8757723b43a/cpp/src/arrow/filesystem/s3fs.cc#L213) The C++ implementation of [`DefaultAWSCredentialsProviderChain`](https://sdk.amazonaws.com/cpp/api/0.14.3/class_aws_1_1_auth_1_1_default_a_w_s_credentials_provider_chain.html) not only [reads the environment variable](https://sdk.amazonaws.com/cpp/api/0.14.3/class_aws_1_1_auth_1_1_environment_a_w_s_credentials_provider.html) when trying to resolve AWS credentials, but also [looks at profile config](https://sdk.amazonaws.com/cpp/api/0.14.3/class_aws_1_1_auth_1_1_profile_config_file_a_w_s_credentials_provider.html) and the [EC2 Instance Metadata Service](https://sdk.amazonaws.com/cpp/api/0.14.3/class_aws_1_1_auth_1_1_instance_profile_credentials_provider.html). ### Are these changes tested? No, just documentation changes ### Are there any user-facing changes? Yes, changing public documentation * Closes: #35409 ### Render Changes Render the changes locally via [Building the doc](https://arrow.apache.org/docs/developers/documentation.html#building-docs): `docs/source/python/filesystems.rst`: ![Screenshot 2023-07-30 at 6 22 02 PM](https://github.com/apache/arrow/assets/9057843/6af053a3-e7a7-4a68-a5b5-02c50e9290c6) `python/pyarrow/_s3fs.pyx`: ![Screenshot 2023-07-31 at 3 31 30 PM](https://github.com/apache/arrow/assets/9057843/d79768be-67ce-46c0-88ed-a833e540f77d) Lead-authored-by: Kevin Liu Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- docs/source/python/filesystems.rst | 5 +++-- python/pyarrow/_s3fs.pyx | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 40656f6b76f43..3fc10dc7718d3 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -153,8 +153,9 @@ PyArrow implements natively a S3 filesystem for S3 compatible storage. The :class:`S3FileSystem` constructor has several options to configure the S3 connection (e.g. credentials, the region, an endpoint override, etc). In addition, the constructor will also inspect configured S3 credentials as -supported by AWS (for example the ``AWS_ACCESS_KEY_ID`` and -``AWS_SECRET_ACCESS_KEY`` environment variables). +supported by AWS (such as the ``AWS_ACCESS_KEY_ID`` and +``AWS_SECRET_ACCESS_KEY`` environment variables, AWS configuration files, +and EC2 Instance Metadata Service for EC2 nodes). Example how you can read contents from a S3 bucket:: diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index e76c7b9ffa730..51c248d147828 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -140,14 +140,20 @@ cdef class S3FileSystem(FileSystem): """ S3-backed FileSystem implementation - If neither access_key nor secret_key are provided, and role_arn is also not - provided, then attempts to initialize from AWS environment variables, - otherwise both access_key and secret_key must be provided. + AWS access_key and secret_key can be provided explicitly. 
If role_arn is provided instead of access_key and secret_key, temporary credentials will be fetched by issuing a request to STS to assume the specified role. + If neither access_key nor secret_key are provided, and role_arn is also not + provided, then attempts to establish the credentials automatically. + S3FileSystem will try the following methods, in order: + + * ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, and ``AWS_SESSION_TOKEN`` environment variables + * configuration files such as ``~/.aws/credentials`` and ``~/.aws/config`` + * for nodes on Amazon EC2, the EC2 Instance Metadata Service + Note: S3 buckets are special and the operations available on them may be limited or more expensive than desired. From 8273f7abd44d4ed582650166de1c6ea59ed759b6 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Tue, 1 Aug 2023 04:05:54 -0500 Subject: [PATCH 073/749] GH-36927: [Java][Docs] Enable Gandiva build as part of Java maven commands (#36929) ### Rationale for this change To close: https://github.com/apache/arrow/issues/36927 ### What changes are included in this PR? - Enable MacOS Gandiva build as part of Java maven commands - Enable Windows ORC build as part of Java maven commands ### Are these changes tested? Yes: - Gandiva: mvn generate-resources -Pgenerate-libs-jni-macos-linux -N - ORC: mvn generate-resources -Pgenerate-libs-jni-windows -N ### Are there any user-facing changes? No * Closes: #36927 Lead-authored-by: david dali susanibar arce Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- docs/source/developers/java/building.rst | 17 +++++++++-------- java/pom.xml | 24 ++++++++++++++---------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 5dd3a979d8293..061c616d4b971 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -132,11 +132,7 @@ Maven $ cd arrow/java $ export JAVA_HOME= $ java --version - $ mvn generate-resources \ - -Pgenerate-libs-jni-macos-linux \ - -DARROW_GANDIVA=ON \ - -DARROW_JAVA_JNI_ENABLE_GANDIVA=ON \ - -N + $ mvn generate-resources -Pgenerate-libs-jni-macos-linux -N $ ls -latr java-dist/lib//*_{jni,java}.* |__ libarrow_dataset_jni.dylib |__ libarrow_orc_jni.dylib @@ -236,7 +232,9 @@ CMake -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_LIBDIR=lib/ \ -DCMAKE_INSTALL_PREFIX=java-dist \ - -DCMAKE_PREFIX_PATH=$PWD/java-dist + -DCMAKE_PREFIX_PATH=$PWD/java-dist \ + -DProtobuf_ROOT=$PWD/../cpp-jni/protobuf_ep-install \ + -DProtobuf_USE_STATIC_LIBS=ON $ cmake --build java-jni --target install --config Release $ ls -latr java-dist/lib//*_{jni,java}.* |__ libarrow_dataset_jni.dylib @@ -257,8 +255,9 @@ CMake -DARROW_DATASET=ON ^ -DARROW_DEPENDENCY_USE_SHARED=OFF ^ -DARROW_FILESYSTEM=ON ^ + -DARROW_GANDIVA=OFF ^ -DARROW_JSON=ON ^ - -DARROW_ORC=OFF ^ + -DARROW_ORC=ON ^ -DARROW_PARQUET=ON ^ -DARROW_S3=ON ^ -DARROW_SUBSTRAIT=ON ^ @@ -280,9 +279,10 @@ CMake -S java ^ -B java-jni ^ -DARROW_JAVA_JNI_ENABLE_C=OFF ^ + -DARROW_JAVA_JNI_ENABLE_DATASET=ON ^ -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON ^ -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF ^ - -DARROW_JAVA_JNI_ENABLE_ORC=OFF ^ + -DARROW_JAVA_JNI_ENABLE_ORC=ON ^ -DBUILD_TESTING=OFF ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_INSTALL_LIBDIR=lib/x86_64 ^ @@ -290,6 +290,7 @@ CMake -DCMAKE_PREFIX_PATH=$PWD/java-dist $ cmake --build java-jni --target install --config Release $ dir "java-dist/bin" + |__ arrow_orc_jni.dll |__ arrow_dataset_jni.dll Archery diff --git a/java/pom.xml 
b/java/pom.xml index d6b37c7df969d..f6837cd82c7b5 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -984,13 +984,13 @@ java-dist false - ON + ON + ON ON ON - OFF OFF ON - OFF + ON ON @@ -1012,8 +1012,8 @@ -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF - -DARROW_CSV=${ARROW_CSV} - -DARROW_DATASET=ON + -DARROW_CSV=${ARROW_DATASET} + -DARROW_DATASET=${ARROW_DATASET} -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON @@ -1070,6 +1070,8 @@ -DCMAKE_INSTALL_LIBDIR=lib/${os.detected.arch} -DCMAKE_INSTALL_PREFIX=${arrow.dataset.jni.dist.dir} -DCMAKE_PREFIX_PATH=${project.basedir}/../java-dist/lib/${os.detected.arch}/cmake + -DProtobuf_USE_STATIC_LIBS=ON + -DProtobuf_ROOT=${project.basedir}/../cpp-jni/protobuf_ep-install ../ @@ -1099,13 +1101,14 @@ java-dist false - ON - OFF + ON + OFF + ON ON OFF ON OFF - OFF + ON @@ -1126,10 +1129,11 @@ -S cpp -B cpp-jni -DARROW_BUILD_SHARED=OFF - -DARROW_CSV=${ARROW_CSV} - -DARROW_DATASET=ON + -DARROW_CSV=${ARROW_DATASET} + -DARROW_DATASET=${ARROW_DATASET} -DARROW_DEPENDENCY_USE_SHARED=OFF -DARROW_FILESYSTEM=ON + -DARROW_GANDIVA=${ARROW_GANDIVA} -DARROW_JSON=${ARROW_DATASET} -DARROW_ORC=${ARROW_ORC} -DARROW_PARQUET=${ARROW_PARQUET} From 66a6136c02ad471585a5c38589ba6d779ad28d16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 1 Aug 2023 14:27:25 +0200 Subject: [PATCH 074/749] GH-36947: [CI] Move free up disk space to the Jinja macros to be able to reuse it on docs job (#36948) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Try to get rid of some failures on docs generation on release and reuse existing code. ### What changes are included in this PR? Move step to a macro to be able to reuse it ### Are these changes tested? Archery tasks ### Are there any user-facing changes? 
No * Closes: #36947 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- dev/tasks/docs/github.linux.yml | 1 + dev/tasks/linux-packages/github.linux.yml | 50 +-------------------- dev/tasks/macros.jinja | 54 +++++++++++++++++++++++ 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/dev/tasks/docs/github.linux.yml b/dev/tasks/docs/github.linux.yml index 0938ac74adc46..6de297b663e01 100644 --- a/dev/tasks/docs/github.linux.yml +++ b/dev/tasks/docs/github.linux.yml @@ -26,6 +26,7 @@ jobs: {{ macros.github_set_env(env) }} steps: {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }} + {{ macros.github_free_space()|indent }} {{ macros.github_install_archery()|indent }} - name: Execute Docker Build diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index bf28cf10e9ecf..6de3edfce07e1 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -32,56 +32,8 @@ jobs: steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} + {{ macros.github_free_space()|indent }} - - name: Free up disk space - if: | - env.ARCHITECTURE == 'amd64' - run: | - df -h - echo "::group::/usr/local/*" - du -hsc /usr/local/* - echo "::endgroup::" - echo "::group::/usr/local/bin/*" - du -hsc /usr/local/bin/* - echo "::endgroup::" - # ~1GB (From 1.2GB to 214MB) - sudo rm -rf \ - /usr/local/bin/aliyun \ - /usr/local/bin/azcopy \ - /usr/local/bin/bicep \ - /usr/local/bin/cmake-gui \ - /usr/local/bin/cpack \ - /usr/local/bin/helm \ - /usr/local/bin/hub \ - /usr/local/bin/kubectl \ - /usr/local/bin/minikube \ - /usr/local/bin/node \ - /usr/local/bin/packer \ - /usr/local/bin/pulumi* \ - /usr/local/bin/stack \ - /usr/local/bin/terraform || : - echo "::group::/usr/local/share/*" - du -hsc /usr/local/share/* - echo "::endgroup::" - # 1.3GB - sudo rm -rf /usr/local/share/powershell || : - echo "::group::/opt/*" - du -hsc /opt/* - echo "::endgroup::" - echo "::group::/opt/hostedtoolcache/*" - du -hsc /opt/hostedtoolcache/* - echo "::endgroup::" - # 5.3GB - sudo rm -rf /opt/hostedtoolcache/CodeQL || : - # 1.4GB - sudo rm -rf /opt/hostedtoolcache/go || : - # 489MB - sudo rm -rf /opt/hostedtoolcache/PyPy || : - # 1.2GB - sudo rm -rf /opt/hostedtoolcache/Python || : - # 376MB - sudo rm -rf /opt/hostedtoolcache/node || : - df -h - name: Set up Ruby run: | sudo apt update diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 1f2f9a72975eb..5948b663d915b 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -72,6 +72,60 @@ on: run: pip install -e arrow/dev/archery[all] {% endmacro %} +{%- macro github_free_space() -%} + - name: Free up disk space + if: runner.os == 'Linux' && runner.arch == 'X64' + shell: bash + run: | + df -h + echo "::group::/usr/local/*" + du -hsc /usr/local/* + echo "::endgroup::" + echo "::group::/usr/local/bin/*" + du -hsc /usr/local/bin/* + echo "::endgroup::" + # ~1GB (From 1.2GB to 214MB) + sudo rm -rf \ + /usr/local/bin/aliyun \ + /usr/local/bin/azcopy \ + /usr/local/bin/bicep \ + /usr/local/bin/cmake-gui \ + /usr/local/bin/cpack \ + /usr/local/bin/helm \ + /usr/local/bin/hub \ + /usr/local/bin/kubectl \ + /usr/local/bin/minikube \ + /usr/local/bin/node \ + /usr/local/bin/packer \ + /usr/local/bin/pulumi* \ + /usr/local/bin/stack \ + /usr/local/bin/terraform || : + echo "::group::/usr/local/share/*" + du -hsc /usr/local/share/* + echo "::endgroup::" + # 1.3GB + sudo rm -rf /usr/local/share/powershell || : + echo 
"::group::/opt/*" + du -hsc /opt/* + echo "::endgroup::" + echo "::group::/opt/hostedtoolcache/*" + du -hsc /opt/hostedtoolcache/* + echo "::endgroup::" + # 5.3GB + sudo rm -rf /opt/hostedtoolcache/CodeQL || : + # 1.4GB + sudo rm -rf /opt/hostedtoolcache/go || : + # 489MB + sudo rm -rf /opt/hostedtoolcache/PyPy || : + # 376MB + sudo rm -rf /opt/hostedtoolcache/node || : + # Remove Web browser packages + sudo apt-get purge -y -f firefox \ + google-chrome-stable \ + microsoft-edge-stable + df -h +{% endmacro %} + {%- macro github_upload_releases(pattern) -%} - name: Set up Python by actions/setup-python if: runner.arch == 'X64' From 943bf4887648bd14a7acc071497a4e4c39788bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 1 Aug 2023 16:44:10 +0200 Subject: [PATCH 075/749] GH-36692: [CI][Packaging] Pin gemfury to 0.12.0 due to issue with faraday dependency (#36693) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Some nightly wheel jobs have failed to upload. ### What changes are included in this PR? Install required gem dependency. ### Are these changes tested? Yes, crossbow tasks ### Are there any user-facing changes? No * Closes: #36692 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- dev/tasks/macros.jinja | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 5948b663d915b..757c15c937ce5 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -191,11 +191,25 @@ on: run: | sudo apt update sudo apt install -y ruby-full + - name: Set up Ruby by GitHub Actions + if: runner.arch == 'X64' && runner.os != 'macOS' + uses: ruby/setup-ruby@v1 + with: + ruby-version: "ruby" + - name: Install gemfury client on ARM self-hosted + if: runner.arch != 'X64' + run: | + # GH-36692: Pin gemfury due to wrong faraday dependency declaration. + gem install --user-install gemfury -v 0.12.0 + ruby -r rubygems -e 'puts("#{Gem.user_dir}/bin")' >> $GITHUB_PATH + - name: Install gemfury client + if: runner.arch == 'X64' + run: | + # GH-36692: Pin gemfury due to wrong faraday dependency declaration. + gem install gemfury -v 0.12.0 - name: Upload package to Gemfury shell: bash run: | - PATH=$(echo $(ruby -r rubygems -e 'puts Gem.user_dir') | sed "s/C:\//\/c\//")/bin:$PATH - gem install --user-install gemfury fury push \ --api-token=${CROSSBOW_GEMFURY_TOKEN} \ --as=${CROSSBOW_GEMFURY_ORG} \ From 5bb53c7ffc9c7ca21d5d415bf8fde4d6b0034b47 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Tue, 1 Aug 2023 11:36:15 -0400 Subject: [PATCH 076/749] GH-36973: [CI][Python] Archery linter integrated with flake8==6.1.0 (#36976) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Fix archery lint issues with new version of flake8 dependency. ### What changes are included in this PR? * Lint errors. ### Are these changes tested? Yes. ``` $ flake8 --version 6.1.0 (mccabe: 0.7.0, pycodestyle: 2.11.0, pyflakes: 3.1.0) CPython 3.11.4 on Darwin $ archery lint --python INFO:archery:Running Python formatter (autopep8) INFO:archery:Running Python linter (flake8) INFO:archery:Running Cython linter (cython-lint) $ ``` ### Are there any user-facing changes? 
No * Closes: #36973 Authored-by: Dane Pitkin Signed-off-by: Raúl Cumplido --- .pre-commit-config.yaml | 2 +- dev/archery/setup.py | 5 ++--- python/pyarrow/tests/test_array.py | 2 +- python/pyarrow/tests/test_csv.py | 2 +- python/pyarrow/tests/test_dataset.py | 8 ++++---- python/pyarrow/tests/test_extension_type.py | 16 ++++++++-------- python/pyarrow/tests/test_flight.py | 2 +- python/pyarrow/tests/test_table.py | 4 ++-- 8 files changed, 20 insertions(+), 21 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e95778ce1cbfd..d3c7624f63e71 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,7 @@ repos: entry: --entrypoint /bin/hadolint hadolint/hadolint:latest - exclude: ^dev/.*$ - repo: https://github.com/pycqa/flake8 - rev: 5.0.3 + rev: 6.1.0 hooks: - id: flake8 name: Python Format diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 7dbfe47d6eeb5..627e576fb6f59 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -28,9 +28,8 @@ jinja_req = 'jinja2>=2.11' extras = { - 'lint': [ - 'numpydoc==1.1.0', 'autopep8', 'flake8', 'cython-lint', 'cmake_format==0.6.13' - ], + 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', + 'cmake_format==0.6.13'], 'benchmark': ['pandas'], 'docker': ['ruamel.yaml', 'python-dotenv'], 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 77eaca3dd1d54..0546830a661de 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -301,7 +301,7 @@ def test_asarray(): np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert type(np_arr[0]) == pa.lib.Int64Value + assert isinstance(np_arr[0], pa.lib.Int64Value) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index e92afce035275..81c31d98ac7ec 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1936,7 +1936,7 @@ def test_write_quoting_style(): except Exception as e: # This will trigger when we try to write a comma (,) # without quotes, which is invalid - assert type(e) == res + assert isinstance(e, res) break assert buf.getvalue() == res buf.seek(0) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a70cf2fbc72af..f92317c0f223e 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -5021,8 +5021,8 @@ def test_dataset_filter(tempdir, dstype): # Ensure chained filtering works. 
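# flake8 6.1 (pycodestyle 2.11) reports E721 for exact-type comparisons such as `type(x) == T`, which is why the assertions below switch to isinstance().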
result = ds1.filter(pc.field("colA") < 3).filter(pc.field("col2") == "a") - assert type(result) == (ds.FileSystemDataset if dstype == - "fs" else ds.InMemoryDataset) + expected = ds.FileSystemDataset if dstype == "fs" else ds.InMemoryDataset + assert isinstance(result, expected) assert result.to_table() == pa.table({ "colA": [1], @@ -5181,9 +5181,9 @@ def test_read_table_nested_columns(tempdir, format): "a.dotted.field": [1, 2], "interaction": [ {"type": None, "element": "button", - "values": [1, 2], "structs":[{"foo": "bar"}, None]}, + "values": [1, 2], "structs": [{"foo": "bar"}, None]}, {"type": "scroll", "element": "window", - "values": [None, 3, 4], "structs":[{"fizz": "buzz"}]} + "values": [None, 3, 4], "structs": [{"fizz": "buzz"}]} ]}) ds.write_dataset(table, tempdir / "table", format=format) ds1 = ds.dataset(tempdir / "table", format=format) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 009896fd67e40..973aa29c7583f 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -340,8 +340,8 @@ def test_ext_scalar_from_array(): assert len(scalars_a) == 4 assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert type(a[0]) == UuidScalarType - assert type(scalars_a[0]) == UuidScalarType + assert isinstance(a[0], UuidScalarType) + assert isinstance(scalars_a[0], UuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -737,7 +737,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): def __eq__(self, other): if isinstance(other, pa.BaseExtensionType): - return (type(self) == type(other) and + return (isinstance(self, type(other)) and self.freq == other.freq) else: return NotImplemented @@ -799,7 +799,7 @@ def test_generic_ext_type_ipc(registered_period_type): arr = pa.ExtensionArray.from_storage(period_type, storage) batch = pa.RecordBatch.from_arrays([arr], ["ext"]) # check the built array has exactly the expected clss - assert type(arr) == period_class + assert isinstance(arr, period_class) buf = ipc_write_batch(batch) del batch @@ -807,7 +807,7 @@ def test_generic_ext_type_ipc(registered_period_type): result = batch.column(0) # check the deserialized array class is the expected one - assert type(result) == period_class + assert isinstance(result, period_class) assert result.type.extension_name == "test.period" assert arr.storage.to_pylist() == [1, 2, 3, 4] @@ -830,7 +830,7 @@ def test_generic_ext_type_ipc(registered_period_type): result = batch.column(0) assert isinstance(result.type, PeriodType) assert result.type.freq == 'H' - assert type(result) == period_class + assert isinstance(result, period_class) def test_generic_ext_type_ipc_unknown(registered_period_type): @@ -1261,7 +1261,7 @@ def test_tensor_type_ipc(tensor_type): # check the built array has exactly the expected clss tensor_class = tensor_type.__arrow_ext_class__() - assert type(arr) == tensor_class + assert isinstance(arr, tensor_class) buf = ipc_write_batch(batch) del batch @@ -1269,7 +1269,7 @@ def test_tensor_type_ipc(tensor_type): result = batch.column(0) # check the deserialized array class is the expected one - assert type(result) == tensor_class + assert isinstance(result, tensor_class) assert result.type.extension_name == "arrow.fixed_shape_tensor" assert arr.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]] diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 6c1c582dceb21..bf15ad0bc4d65 
100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -1495,7 +1495,7 @@ def test_tls_override_hostname(): """Check that incorrectly overriding the hostname fails.""" certs = example_tls_certs() - with ConstantFlightServer(tls_certificates=certs["certificates"]) as s,\ + with ConstantFlightServer(tls_certificates=certs["certificates"]) as s, \ flight.connect(('localhost', s.port), tls_root_certs=certs["root_cert"], override_hostname="fakehostname") as client: diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 61cfb1af587a7..e28256e91f0a6 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -511,7 +511,7 @@ def test_recordbatch_basics(): ('c0', [0, 1, 2, 3, 4]), ('c1', [-10, -5, 0, None, 10]) ]) - assert type(pydict) == dict + assert isinstance(pydict, dict) with pytest.raises(IndexError): # bounds checking @@ -949,7 +949,7 @@ def test_table_basics(): ('a', [0, 1, 2, 3, 4]), ('b', [-10, -5, 0, 5, 10]) ]) - assert type(pydict) == dict + assert isinstance(pydict, dict) columns = [] for col in table.itercolumns(): From 0fb744cb6029a9b7c8faa38205d73f94118ec7c5 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 1 Aug 2023 11:57:02 -0400 Subject: [PATCH 077/749] GH-36935: [Go] Fix Timestamp to Time dates (#36964) ### Rationale for this change The previous solution converted everything to nanoseconds first but you end up with overflowing `int64` potentially. Since we've bumped the minimum version of the library to using go1.17+ we can use the newer `UnixMicro` and `UnixMilli` functions to make this easy. ### Are these changes tested? Yes, unit test is added. * Closes: #36935 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/datatype_fixedwidth.go | 12 ++++++++++-- go/arrow/datatype_fixedwidth_test.go | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index d6550c1cf896d..4b6ca55291537 100644 --- a/go/arrow/datatype_fixedwidth.go +++ b/go/arrow/datatype_fixedwidth.go @@ -192,10 +192,16 @@ func TimestampFromString(val string, unit TimeUnit) (Timestamp, error) { } func (t Timestamp) ToTime(unit TimeUnit) time.Time { - if unit == Second { + switch unit { + case Second: return time.Unix(int64(t), 0).UTC() + case Millisecond: + return time.UnixMilli(int64(t)).UTC() + case Microsecond: + return time.UnixMicro(int64(t)).UTC() + default: + return time.Unix(0, int64(t)).UTC() } - return time.Unix(0, int64(t)*int64(unit.Multiplier())).UTC() } // TimestampFromTime allows converting time.Time to Timestamp @@ -327,6 +333,8 @@ const ( var TimeUnitValues = []TimeUnit{Second, Millisecond, Microsecond, Nanosecond} +// Multiplier returns a time.Duration value to multiply by in order to +// convert the value into nanoseconds func (u TimeUnit) Multiplier() time.Duration { return [...]time.Duration{time.Second, time.Millisecond, time.Microsecond, time.Nanosecond}[uint(u)&3] } diff --git a/go/arrow/datatype_fixedwidth_test.go b/go/arrow/datatype_fixedwidth_test.go index 50747366a255e..669c7f9ca87ad 100644 --- a/go/arrow/datatype_fixedwidth_test.go +++ b/go/arrow/datatype_fixedwidth_test.go @@ -159,6 +159,13 @@ func TestTimestampType(t *testing.T) { } } +func TestTimestampToTime(t *testing.T) { + ts := arrow.Timestamp(11865225600000) + tm := ts.ToTime(arrow.Millisecond) + + assert.Equal(t, "2345-12-30 00:00:00", tm.Format("2006-01-02 15:04:05.999")) +} + func TestTime32Type(t *testing.T) { for _, tc 
:= range []struct { unit arrow.TimeUnit From f44e28fa03a64ae5b3d9352d21aee2cc84f9af6c Mon Sep 17 00:00:00 2001 From: Thor <8681572+thorfour@users.noreply.github.com> Date: Tue, 1 Aug 2023 12:00:13 -0500 Subject: [PATCH 078/749] GH-36981: [Go] Fix ipc reader leak (#36982) ### Rationale for this change Previously the ipc reader was leaking allocations. ### What changes are included in this PR? Call `Clear()` on the memo table on final release of the ipc reader. ### Are these changes tested? Yes ### Are there any user-facing changes? * Closes: #36981 Authored-by: thorfour Signed-off-by: Matt Topol --- go/arrow/ipc/reader.go | 1 + go/arrow/ipc/reader_test.go | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/go/arrow/ipc/reader.go b/go/arrow/ipc/reader.go index 99aab597ce950..bee48cf965682 100644 --- a/go/arrow/ipc/reader.go +++ b/go/arrow/ipc/reader.go @@ -159,6 +159,7 @@ func (r *Reader) Release() { r.r.Release() r.r = nil } + r.memo.Clear() } } diff --git a/go/arrow/ipc/reader_test.go b/go/arrow/ipc/reader_test.go index a8930984fbf37..7bcf737af0d6d 100644 --- a/go/arrow/ipc/reader_test.go +++ b/go/arrow/ipc/reader_test.go @@ -56,3 +56,40 @@ func TestReaderCatchPanic(t *testing.T) { assert.Contains(t, err.Error(), "arrow/ipc: unknown error while reading") } } + +func TestReaderCheckedAllocator(t *testing.T) { + alloc := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer alloc.AssertSize(t, 0) + schema := arrow.NewSchema([]arrow.Field{ + { + Name: "s", + Type: &arrow.DictionaryType{ + ValueType: arrow.BinaryTypes.String, + IndexType: arrow.PrimitiveTypes.Int32, + }, + }, + }, nil) + + b := array.NewRecordBuilder(alloc, schema) + defer b.Release() + + bldr := b.Field(0).(*array.BinaryDictionaryBuilder) + bldr.Append([]byte("foo")) + bldr.Append([]byte("bar")) + bldr.Append([]byte("baz")) + + rec := b.NewRecord() + defer rec.Release() + + buf := new(bytes.Buffer) + writer := NewWriter(buf, WithSchema(schema), WithAllocator(alloc)) + defer writer.Close() + require.NoError(t, writer.Write(rec)) + + reader, err := NewReader(buf, WithAllocator(alloc)) + require.NoError(t, err) + defer reader.Release() + + _, err = reader.Read() + require.NoError(t, err) +} From e34d4c93761d4761f0df2626114927c80a81a87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 2 Aug 2023 00:20:00 +0200 Subject: [PATCH 079/749] GH-36199: [Python][CI][Spark] Update spark versions used on our nightly tests (#36347) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change We are currently testing with very old or deprecated spark versions. ### What changes are included in this PR? Update the spark versions to be tested. ### Are these changes tested? They will via archery ### Are there any user-facing changes? 
No * Closes: #36199 Authored-by: Raúl Cumplido Signed-off-by: Jacob Wujciak-Jens --- dev/tasks/tasks.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 25bda38809e3b..73b793162d959 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1588,9 +1588,9 @@ tasks: image: conda-python-hdfs {% endfor %} -{% for python_version, spark_version, test_pyarrow_only, numpy_version in [("3.8", "v3.1.2", "false", "latest"), - ("3.9", "v3.2.0", "false", "1.23"), - ("3.10", "master", "false", "latest")] %} +{% for python_version, spark_version, test_pyarrow_only, numpy_version in [("3.8", "v3.4.1", "false", "latest"), + ("3.10", "v3.4.1", "false", "1.23"), + ("3.11", "master", "false", "latest")] %} test-conda-python-{{ python_version }}-spark-{{ spark_version }}: ci: github template: docker-tests/github.linux.yml From 7325a0160acd0f4c708d5abfd34cc8eb6df6c8eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 2 Aug 2023 00:25:28 +0200 Subject: [PATCH 080/749] MINOR: [C#] Bump Grpc.Tools from 2.42.0 to 2.56.2 in /csharp (#36727) Bumps [Grpc.Tools](https://github.com/grpc/grpc) from 2.42.0 to 2.56.2.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Jacob Wujciak-Jens --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index d9956ed49cc44..0bcd6c87b0d06 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -6,7 +6,7 @@ -    <PackageReference Include="Grpc.Tools" Version="2.42.0" /> +    <PackageReference Include="Grpc.Tools" Version="2.56.2" /> From b8e450347ab111a4f8b2fe5134c59235bcd50b20 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 2 Aug 2023 06:27:12 +0800 Subject: [PATCH 081/749] MINOR: [C++] Acero tiny typo fix (#36938) ### Rationale for this change 1. Some typo fix in acero 2. move `std::shared_ptr` rather than copy ### What changes are included in this PR? 1. Some typo fix in acero 2. move `std::shared_ptr` rather than copy ### Are these changes tested? no ### Are there any user-facing changes? no Authored-by: mwish Signed-off-by: Jacob Wujciak-Jens --- cpp/src/arrow/acero/exec_plan.h | 2 +- cpp/src/arrow/acero/options.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/acero/exec_plan.h b/cpp/src/arrow/acero/exec_plan.h index 04303aa9512b1..dba6c64ddc837 100644 --- a/cpp/src/arrow/acero/exec_plan.h +++ b/cpp/src/arrow/acero/exec_plan.h @@ -739,7 +739,7 @@ DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context); /// \brief Utility method to run a declaration and return results as a RecordBatchReader /// /// If an exec context is not provided then a default exec context will be used based -/// on the value of `use_threads`. If `use_threads` is false then the CPU exeuctor will +/// on the value of `use_threads`. If `use_threads` is false then the CPU executor will /// be a serial executor and all CPU work will be done on the calling thread. I/O tasks /// will still happen on the I/O executor and may be multi-threaded. /// diff --git a/cpp/src/arrow/acero/options.h b/cpp/src/arrow/acero/options.h index bb94bdaa4a628..1ede3fbfc8ed0 100644 --- a/cpp/src/arrow/acero/options.h +++ b/cpp/src/arrow/acero/options.h @@ -80,7 +80,7 @@ class ARROW_ACERO_EXPORT ExecNodeOptions { /// /// For each batch received a new task will be created to push that batch downstream. /// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the -/// parent batch and call InputRecieved. Thus, if the `generator` yields a large +/// parent batch and call InputReceived. Thus, if the `generator` yields a large /// batch it may result in several calls to InputReceived. /// /// The SourceNode will, by default, assign an implicit ordering to outgoing batches. @@ -115,7 +115,7 @@ class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions { public: /// Create an instance from values
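// Taking the shared_ptr parameters by value and std::move-ing them into the members (the `+` lines below) saves an atomic reference-count increment compared with copying.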
TableSourceNodeOptions(std::shared_ptr<Table> table, int64_t max_batch_size = kDefaultMaxBatchSize) - : table(table), max_batch_size(max_batch_size) {} + : table(std::move(table)), max_batch_size(max_batch_size) {} /// \brief a table which acts as the data source
std::shared_ptr<Table> table; @@ -135,7 +135,7 @@ class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions { public: /// Create an instance from values NamedTableNodeOptions(std::vector<std::string> names, std::shared_ptr<Schema> schema) - : names(std::move(names)), schema(schema) {} + : names(std::move(names)), schema(std::move(schema)) {} /// \brief the names to put in the serialized plan std::vector<std::string> names; @@ -156,7 +156,7 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { /// Create an instance that will create a new task on io_executor for each iteration SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker, arrow::internal::Executor* io_executor) - : schema(schema), + : schema(std::move(schema)), it_maker(std::move(it_maker)), io_executor(io_executor), requires_io(true) {} @@ -165,7 +165,7 @@ /// executor SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker, bool requires_io = false) - : schema(schema), + : schema(std::move(schema)), it_maker(std::move(it_maker)), io_executor(NULLPTR), requires_io(requires_io) {} From a046b882676c7b7826ac7b09f200622c80952cba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 2 Aug 2023 08:23:39 +0900 Subject: [PATCH 082/749] MINOR: [C#] Bump Grpc.Net.Client from 2.52.0 to 2.55.0 in /csharp (#36729) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Grpc.Net.Client](https://github.com/grpc/grpc-dotnet) from 2.52.0 to 2.55.0.
Release notes (sourced from Grpc.Net.Client's releases):
- Release v2.55.0. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.55.0-pre1...v2.55.0
- Release v2.55.0-pre1. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.54.0...v2.55.0-pre1
- Release v2.54.0. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.54.0-pre1...v2.54.0
- Release v2.54.0-pre1. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.53.0...v2.54.0-pre1
- Release v2.53.0. ... (truncated)
Commits:
- 8ca08eb Upgrade version to v2.55.0 (on v2.55.x branch) (#2184)
- 76824ea Update 2.55 branch to 2.55-pre1 (#2176)
- c72ada6 Update Grpc.Tools to 2.55 (#2171)
- 2be676a WriteAsync cancellation throws an error with the calls completed status if po...
- 73c726b Fix client factory extension method validation issues (#2159)
- 697f349 Refactor healthchecks service mapping to support filtering on check (#2142)
- ff1a07b Update Google.Protobuf to 3.23.1 (#2136)
- 6bc44e9 Fix flaky streaming test (#2134)
- f9a00bc Fix capturing ExecutionContext by timers and background tasks (#2129)
- 2dc971e Add IReadOnlyDictionary to BalancerAttributes collection (#2121)
- Additional commits viewable in compare view
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 0bcd6c87b0d06..ed33d88861415 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -6,7 +6,7 @@ -    <PackageReference Include="Grpc.Net.Client" Version="2.52.0" /> +    <PackageReference Include="Grpc.Net.Client" Version="2.55.0" /> From 0edef8c90797ba5dcadd0daf5009dc2d91ecd099 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 1 Aug 2023 19:39:39 -0400 Subject: [PATCH 083/749] GH-36975: [C++][FlightRPC] Skip unknown fields, don't crash (#36979) ### Rationale for this change We should skip unknown fields instead of crashing, for forwards compatibility. ### What changes are included in this PR? Skip unknown fields in the FlightData deserializer. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #36975 Authored-by: David Li Signed-off-by: Sutou Kouhei --- cpp/src/arrow/flight/flight_internals_test.cc | 35 +++++++++++++++++++ .../transport/grpc/serialization_internal.cc | 34 ++++++++++-------- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index 27c13ff949836..e56bab6db2092 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -34,6 +34,9 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/string.h" +// Include after Flight headers +#include + namespace arrow { namespace flight { @@ -651,6 +654,38 @@ TEST_F(TestCookieParsing, CookieCache) { AddCookieVerifyCache({"id0=0;", "id1=1;", "id2=2"}, "id0=0; id1=1; id2=2"); } +// ---------------------------------------------------------------------- +// Protobuf tests + +TEST(GrpcTransport, FlightDataDeserialize) { +#ifndef _WIN32 + pb::FlightData raw; + // Tack on known and unknown fields by hand here + raw.GetReflection()->MutableUnknownFields(&raw)->AddFixed32(900, 1024); + raw.GetReflection()->MutableUnknownFields(&raw)->AddFixed64(901, 1024); + raw.GetReflection()->MutableUnknownFields(&raw)->AddVarint(902, 1024); + raw.GetReflection()->MutableUnknownFields(&raw)->AddLengthDelimited(903, "foobar"); + // Known field comes at end + raw.GetReflection()->MutableUnknownFields(&raw)->AddLengthDelimited( + pb::FlightData::kDataBodyFieldNumber, "data"); + + auto serialized = raw.SerializeAsString(); + + grpc_slice slice = grpc_slice_from_copied_buffer(serialized.data(), serialized.size()); + // gRPC requires that grpc_slice and grpc::Slice have the same representation + grpc::ByteBuffer buffer(reinterpret_cast<grpc::Slice*>(&slice), /*nslices=*/1); + + flight::internal::FlightData out; + auto status = flight::transport::grpc::FlightDataDeserialize(&buffer, &out); + ASSERT_TRUE(status.ok()); + ASSERT_EQ("data", out.body->ToString()); + + grpc_slice_unref(slice); +#else + GTEST_SKIP() << "Can't use Protobuf symbols on Windows"; +#endif +} + // ---------------------------------------------------------------------- // Transport abstraction tests diff --git a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc index cff111d64df91..372dca7a2c4c8 100644 --- a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc +++
b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -297,7 +298,7 @@ ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out, const auto remainder = static_cast( bit_util::RoundUpToMultipleOf8(buffer->size()) - buffer->size()); if (remainder) { - slices.push_back(::grpc::Slice(kPaddingBytes, remainder)); + slices.emplace_back(kPaddingBytes, remainder); } } } @@ -316,7 +317,7 @@ ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out, ::grpc::Status FlightDataDeserialize(ByteBuffer* buffer, arrow::flight::internal::FlightData* out) { if (!buffer) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "No payload"); + return {::grpc::StatusCode::INTERNAL, "No payload"}; } // Reset fields in case the caller reuses a single allocation @@ -342,42 +343,45 @@ ::grpc::Status FlightDataDeserialize(ByteBuffer* buffer, pb::FlightDescriptor pb_descriptor; uint32_t length; if (!pb_stream.ReadVarint32(&length)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to parse length of FlightDescriptor"); + return {::grpc::StatusCode::INTERNAL, + "Unable to parse length of FlightDescriptor"}; } // Can't use ParseFromCodedStream as this reads the entire // rest of the stream into the descriptor command field. std::string buffer; pb_stream.ReadString(&buffer, length); if (!pb_descriptor.ParseFromString(buffer)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to parse FlightDescriptor"); + return {::grpc::StatusCode::INTERNAL, "Unable to parse FlightDescriptor"}; } arrow::flight::FlightDescriptor descriptor; GRPC_RETURN_NOT_OK( arrow::flight::internal::FromProto(pb_descriptor, &descriptor)); - out->descriptor.reset(new arrow::flight::FlightDescriptor(descriptor)); + out->descriptor = std::make_unique<arrow::flight::FlightDescriptor>(descriptor); } break; case pb::FlightData::kDataHeaderFieldNumber: { if (!ReadBytesZeroCopy(wrapped_buffer, &pb_stream, &out->metadata)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to read FlightData metadata"); + return {::grpc::StatusCode::INTERNAL, "Unable to read FlightData metadata"}; } } break; case pb::FlightData::kAppMetadataFieldNumber: { if (!ReadBytesZeroCopy(wrapped_buffer, &pb_stream, &out->app_metadata)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to read FlightData application metadata"); + return {::grpc::StatusCode::INTERNAL, + "Unable to read FlightData application metadata"}; } } break; case pb::FlightData::kDataBodyFieldNumber: { if (!ReadBytesZeroCopy(wrapped_buffer, &pb_stream, &out->body)) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, - "Unable to read FlightData body"); + return {::grpc::StatusCode::INTERNAL, "Unable to read FlightData body"}; } } break; - default: - DCHECK(false) << "cannot happen"; + default: { + // Unknown field. We should skip it for compatibility. + if (!WireFormatLite::SkipField(&pb_stream, tag)) { + return {::grpc::StatusCode::INTERNAL, + "Could not skip unknown field tag in FlightData"}; + } + break; + } } } buffer->Clear(); From 836aa1e5ccaa20fb5d3b4b7f44adc84642ff0e56 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 2 Aug 2023 11:23:44 +0800 Subject: [PATCH 084/749] GH-36970: [C++][Parquet] Minor style fix for parquet metadata (#36971) ### Rationale for this change Minor style fix ### What changes are included in this PR? 1. Change `new` to `make_unique` or `make_shared` 2. Add a reserve for `column_order` ### Are these changes tested? Not required.
### Are there any user-facing changes? no * Closes: #36970 Authored-by: mwish Signed-off-by: Gang Wu --- cpp/src/parquet/metadata.cc | 15 ++++++++------- cpp/src/parquet/thrift_internal.h | 3 +-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 541bcc18b8bc3..8aedf5b926add 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -593,13 +593,13 @@ class FileMetaData::FileMetaDataImpl { FileMetaDataImpl() = default; explicit FileMetaDataImpl( - const void* metadata, uint32_t* metadata_len, const ReaderProperties& properties, + const void* metadata, uint32_t* metadata_len, ReaderProperties properties, std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr) - : properties_(properties), file_decryptor_(file_decryptor) { - metadata_.reset(new format::FileMetaData); + : properties_(std::move(properties)), file_decryptor_(std::move(file_decryptor)) { + metadata_ = std::make_unique<format::FileMetaData>(); auto footer_decryptor = - file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr; + file_decryptor_ != nullptr ? file_decryptor_->GetFooterDecryptor() : nullptr; ThriftDeserializer deserializer(properties_); deserializer.DeserializeMessage(reinterpret_cast<const uint8_t*>(metadata), @@ -779,8 +779,8 @@ class FileMetaData::FileMetaDataImpl { } std::shared_ptr<FileMetaData> out(new FileMetaData()); - out->impl_.reset(new FileMetaDataImpl()); - out->impl_->metadata_.reset(new format::FileMetaData()); + out->impl_ = std::make_unique<FileMetaDataImpl>(); + out->impl_->metadata_ = std::make_unique<format::FileMetaData>(); auto metadata = out->impl_->metadata_.get(); metadata->version = metadata_->version; @@ -834,6 +834,7 @@ // update ColumnOrder std::vector<ColumnOrder> column_orders; if (metadata_->__isset.column_orders) { + column_orders.reserve(metadata_->column_orders.size()); for (auto column_order : metadata_->column_orders) { if (column_order.__isset.TYPE_ORDER) { column_orders.push_back(ColumnOrder::type_defined_); @@ -865,7 +866,7 @@ std::shared_ptr<FileMetaData> FileMetaData::Make( std::shared_ptr<InternalFileDecryptor> file_decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr<FileMetaData>( - new FileMetaData(metadata, metadata_len, properties, file_decryptor)); + new FileMetaData(metadata, metadata_len, properties, std::move(file_decryptor))); } std::shared_ptr<FileMetaData> FileMetaData::Make( diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index e9b859541b759..5824a82d5b86d 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -435,8 +435,7 @@ class ThriftDeserializer { #if PARQUET_THRIFT_VERSION_MAJOR > 0 || PARQUET_THRIFT_VERSION_MINOR >= 14 auto conf = std::make_shared<apache::thrift::TConfiguration>(); conf->setMaxMessageSize(std::numeric_limits<int>::max()); - return std::shared_ptr<ThriftBuffer>( - new ThriftBuffer(buf, len, ThriftBuffer::OBSERVE, conf)); + return std::make_shared<ThriftBuffer>(buf, len, ThriftBuffer::OBSERVE, conf); #else return std::make_shared<ThriftBuffer>(buf, len); #endif From 35019647b672e9a03e6b69aa6f1ad285be1c2aee Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:20:08 -0400 Subject: [PATCH 085/749] GH-36069: [Java] Ensure S3 is finalized on shutdown (#36934) ### Rationale for this change Java datasets can implicitly create an S3 filesystem, which will initialize S3 APIs. There is currently no explicit call to shut down S3 APIs in Java, which results in a warning message being printed at runtime: `arrow::fs::FinalizeS3 was not called even though S3 was initialized.
This could lead to a segmentation fault at exit` ### What changes are included in this PR? * Add a Java runtime shutdown hook that calls `EnsureS3Finalized()` via JNI. This is a noop if S3 is uninitialized or already finalized. ### Are these changes tested? Yes, reproduced with: ``` import org.apache.arrow.dataset.file.FileFormat; import org.apache.arrow.dataset.file.FileSystemDatasetFactory; import org.apache.arrow.dataset.jni.NativeMemoryPool; import org.apache.arrow.dataset.source.DatasetFactory; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; public class DatasetModule { public static void main(String[] args) { String uri = "s3://voltrondata-labs-datasets/nyc-taxi-tiny/year=2022/month=2/part-0.parquet"; try ( BufferAllocator allocator = new RootAllocator(); DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); ) { // S3 is initialized } catch (Exception e) { e.printStackTrace(); } } } ``` I didn't think a unit test was worth adding. Let me know if you think otherwise. Reasoning: * We can't test the actual shutdown since thats a JVM thing. * We could test to see if the hook is registered, but that involves exposing the API and having access to the thread object registered with the hook. Or using reflection to obtain it. Not worth it IMO. * No need to test the functionality inside the hook, its just a wrapper around a single C++ API with no params/retval. ### Are there any user-facing changes? No * Closes: #36069 Authored-by: Dane Pitkin Signed-off-by: David Li --- java/dataset/src/main/cpp/jni_wrapper.cc | 13 +++++++++++++ .../org/apache/arrow/dataset/jni/JniLoader.java | 5 +++++ .../org/apache/arrow/dataset/jni/JniWrapper.java | 6 ++++++ 3 files changed, 24 insertions(+) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 871a2e95b94ec..5640bc4349670 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -27,6 +27,7 @@ #include "arrow/dataset/file_base.h" #include "arrow/filesystem/localfs.h" #include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/s3fs.h" #include "arrow/engine/substrait/util.h" #include "arrow/ipc/api.h" #include "arrow/util/iterator.h" @@ -569,6 +570,18 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_releaseBuffe JNI_METHOD_END() } +/* + * Class: org_apache_arrow_dataset_jni_JniWrapper + * Method: ensureS3Finalized + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_ensureS3Finalized( + JNIEnv* env, jobject) { + JNI_METHOD_START + JniAssertOkOrThrow(arrow::fs::EnsureS3Finalized()); + JNI_METHOD_END() +} + /* * Class: org_apache_arrow_dataset_file_JniWrapper * Method: makeFileSystemDatasetFactory diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java index 7ada21c058280..a3b31c73e8540 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniLoader.java @@ -59,6 +59,7 @@ public void ensureLoaded() { return; } loadRemaining(); + ensureS3FinalizedOnShutdown(); } private synchronized void loadRemaining() { @@ -109,4 +110,8 @@ private String getNormalizedArch() { } return arch; } + + private void ensureS3FinalizedOnShutdown() { + Runtime.getRuntime().addShutdownHook(new Thread(() -> { 
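+ // Safe to register unconditionally: the native call below is a noop when S3
+ // was never initialized or has already been finalized.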
JniWrapper.get().ensureS3Finalized(); })); + } } diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java index 1a9d4188c168f..93cc5d7a37040 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java @@ -108,4 +108,10 @@ private JniWrapper() { * @param bufferId the native pointer of the arrow::Buffer instance. */ public native void releaseBuffer(long bufferId); + + /** + * Ensure the S3 APIs are shut down, but only if not already done. If the S3 APIs are uninitialized, + * then this is a noop. + */ + public native void ensureS3Finalized(); } From 0de6673e34f453fadd3841690bd14aa703a17001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 2 Aug 2023 22:22:51 +0200 Subject: [PATCH 086/749] MINOR: [Doc][Release] Update wrong reference from JIRA to GitHub issues (#36989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Some stray doc references from the JIRA to GitHub migration. ### What changes are included in this PR? A couple of docs changes on the mail and the Archery release curate report. ### Are these changes tested? Yes ### Are there any user-facing changes? No Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/archery/archery/templates/release_curation.txt.j2 | 2 +- dev/release/02-source-test.rb | 2 +- dev/release/02-source.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/archery/archery/templates/release_curation.txt.j2 b/dev/archery/archery/templates/release_curation.txt.j2 index 0796f451625f1..8e72290366bdd 100644 --- a/dev/archery/archery/templates/release_curation.txt.j2 +++ b/dev/archery/archery/templates/release_curation.txt.j2 @@ -39,7 +39,7 @@ {% for commit in noissue -%} - {{ commit.url }} {{ commit.title }} {% endfor %} -### JIRA issues in version {{ release.version }} without a linked patch: {{ nopatch|length }} +### GitHub issues in version {{ release.version }} without a linked patch: {{ nopatch|length }} {% for issue in nopatch -%} - https://github.com/apache/arrow/issues/{{ issue.key }} {% endfor %} diff --git a/dev/release/02-source-test.rb b/dev/release/02-source-test.rb index 1d0fd19d01b84..b9e6a8505b72b 100644 --- a/dev/release/02-source-test.rb +++ b/dev/release/02-source-test.rb @@ -134,7 +134,7 @@ def test_vote I would like to propose the following release candidate (RC0) of Apache Arrow version #{@release_version}. This is a release consisting of #{n_resolved_issues} -resolved JIRA issues[1]. +resolved GitHub issues[1]. This release candidate is based on commit: #{@current_commit} [2] diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index 1e54d6d10db76..e9cd7126361cd 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -168,7 +168,7 @@ Hi, I would like to propose the following release candidate (RC${rc}) of Apache Arrow version ${version}. This is a release consisting of ${n_resolved_issues} -resolved JIRA issues[1].
+resolved GitHub issues[1]. This release candidate is based on commit: ${release_hash} [2] From 710f96069f9f036338a0dd99202f0486bb7d1984 Mon Sep 17 00:00:00 2001 From: KarateSnowMachine Date: Thu, 3 Aug 2023 05:32:52 -0400 Subject: [PATCH 087/749] GH-36189: [C++][Parquet] StreamReader::SkipRows() skips to incorrect place in multi-row-group files (#36191) ### Rationale for this change The behavior of Parquet `StreamReader::SkipRows()` is wrong due to an error in calculating the row offset from the current row group. ### What changes are included in this PR? A unit test case demonstrating the failure and a trivial fix. ### Are these changes tested? Yes ### Are there any user-facing changes? No I am not sure if this bug is critical given how long it has existed in the code and no one has seemed to notice. There are two manifestations of this bug that might give the user the wrong impression about what is in their data: * sometimes a negative return value is returned, which is unexpected given the nature of the API, so the user should know something is up (this is how I discovered the bug) * the `SkipRows()` call leads to setting of the `eof` flag prematurely, which might lead the user to think there is less data in the file than there is. * Closes: #36189 Lead-authored-by: Paul Rosenfeld Co-authored-by: Gang Wu Co-authored-by: KarateSnowMachine Co-authored-by: Antoine Pitrou Signed-off-by: Gang Wu --- cpp/src/parquet/stream_reader.cc | 2 +- cpp/src/parquet/stream_reader_test.cc | 112 +++++++++++++++++++++++--- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc index 0fecb1bf24615..d3353aa334256 100644 --- a/cpp/src/parquet/stream_reader.cc +++ b/cpp/src/parquet/stream_reader.cc @@ -441,7 +441,7 @@ int64_t StreamReader::SkipRows(int64_t num_rows_to_skip) { while (!eof_ && (num_rows_remaining_to_skip > 0)) { int64_t num_rows_in_row_group = row_group_reader_->metadata()->num_rows(); int64_t num_rows_remaining_in_row_group = - num_rows_in_row_group - current_row_ - row_group_row_offset_; + num_rows_in_row_group - (current_row_ - row_group_row_offset_); if (num_rows_remaining_in_row_group > num_rows_remaining_to_skip) { for (auto reader : column_readers_) { diff --git a/cpp/src/parquet/stream_reader_test.cc b/cpp/src/parquet/stream_reader_test.cc index fed036bca546a..04140f6ad0c3a 100644 --- a/cpp/src/parquet/stream_reader_test.cc +++ b/cpp/src/parquet/stream_reader_test.cc @@ -17,13 +17,11 @@ #include "parquet/stream_reader.h" -#include #include #include #include -#include #include "arrow/io/file.h" #include "arrow/util/decimal.h" @@ -38,7 +36,7 @@ using optional = StreamReader::optional; using ::std::nullopt; struct TestData { - static void init() { std::time(&ts_offset_); } + static void Init() { std::time(&ts_offset_); } static constexpr int num_rows = 2000; @@ -145,18 +143,18 @@ constexpr int TestData::num_rows; class TestStreamReader : public ::testing::Test { public: - TestStreamReader() { createTestFile(); } + TestStreamReader() { CreateTestFile(); } protected: const char* GetDataFile() const { return "stream_reader_test.parquet"; } - void SetUp() { + void SetUp() override { PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile())); auto file_reader = parquet::ParquetFileReader::Open(infile); reader_ = StreamReader{std::move(file_reader)}; } - void TearDown() { reader_ = StreamReader{}; } + void TearDown() override { reader_ = StreamReader{}; } std::shared_ptr<schema::GroupNode> GetSchema() { schema::NodeVector fields;
@@ -201,7 +199,7 @@ class TestStreamReader : public ::testing::Test { schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); } - void createTestFile() { + void CreateTestFile() { PARQUET_ASSIGN_OR_THROW(auto outfile, ::arrow::io::FileOutputStream::Open(GetDataFile())); @@ -209,7 +207,7 @@ class TestStreamReader : public ::testing::Test { StreamWriter os{std::move(file_writer)}; - TestData::init(); + TestData::Init(); for (auto i = 0; i < TestData::num_rows; ++i) { os << TestData::GetBool(i); @@ -586,7 +584,7 @@ TEST_F(TestStreamReader, SkipColumns) { class TestOptionalFields : public ::testing::Test { public: - TestOptionalFields() { createTestFile(); } + TestOptionalFields() { CreateTestFile(); } protected: const char* GetDataFile() const { return "stream_reader_test_optional_fields.parquet"; } @@ -644,13 +642,13 @@ class TestOptionalFields : public ::testing::Test { schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); } - void createTestFile() { + void CreateTestFile() { PARQUET_ASSIGN_OR_THROW(auto outfile, ::arrow::io::FileOutputStream::Open(GetDataFile())); StreamWriter os{ParquetFileWriter::Open(outfile, GetSchema())}; - TestData::init(); + TestData::Init(); for (auto i = 0; i < TestData::num_rows; ++i) { os << TestData::GetOptBool(i); @@ -732,7 +730,7 @@ TEST_F(TestOptionalFields, ReadOptionalFieldAsRequiredField) { _provided_ that the optional value is available. This can be useful if a schema is changed such that a required - field beomes optional. Applications can continue reading the + field becomes optional. Applications can continue reading the field as if it were mandatory and do not need to be changed if the field value is always provided. @@ -947,5 +945,95 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) { EXPECT_EQ(i, 25); } +class TestMultiRowGroupStreamReader : public ::testing::Test { + protected: + const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; } + + void SetUp() override { + CreateTestFile(); + PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile())); + auto file_reader = parquet::ParquetFileReader::Open(infile); + reader_ = StreamReader{std::move(file_reader)}; + } + + void TearDown() override { reader_ = StreamReader{}; } + + std::shared_ptr<schema::GroupNode> GetSchema() { + schema::NodeVector fields; + fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED, + Type::INT32, ConvertedType::UINT_16)); + + fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED, + Type::INT64, ConvertedType::UINT_64)); + + return std::static_pointer_cast<schema::GroupNode>( + schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } + + void CreateTestFile() { + PARQUET_ASSIGN_OR_THROW(auto outfile, + ::arrow::io::FileOutputStream::Open(GetDataFile())); + + auto file_writer = ParquetFileWriter::Open(outfile, GetSchema()); + + StreamWriter os{std::move(file_writer)}; + + int nrows = 0; + for (auto group = 0; group < kNumGroups; ++group) { + for (auto i = 0; i < kNumRowsPerGroup; ++i) { + os << static_cast<uint16_t>(group); + os << static_cast<uint64_t>(nrows); + os << EndRow; + nrows++; + } + os.EndRowGroup(); + } + } + + void ReadRowAndAssertPosition(uint64_t expected_row_num) { + const auto expected_group_num = + static_cast<uint16_t>(expected_row_num / kNumRowsPerGroup); + ASSERT_FALSE(reader_.eof()); + uint16_t group_num = 0; + uint64_t row_num = 0; + reader_ >> group_num >> row_num >> EndRow; + ASSERT_EQ(group_num, expected_group_num); + ASSERT_EQ(row_num, expected_row_num); + } + +
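+ // Worked example of the corrected skip arithmetic using this test's constants:
+ // after skipping to row 33 (kNumRowsPerGroup = 10, so row_group_row_offset_ = 30
+ // and current_row_ = 33), the old expression 10 - 33 - 30 = -53 made the group
+ // look exhausted, while the fixed 10 - (33 - 30) leaves 7 skippable rows.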
StreamReader reader_; + static constexpr int kNumGroups = 5; + static constexpr int kNumRowsPerGroup = 10; +}; + +TEST_F(TestMultiRowGroupStreamReader, SkipRows) { + // skip somewhere into the middle of a row group somewhere in the middle of the file + auto current_row = 33; + + auto retval = reader_.SkipRows(current_row); + ASSERT_EQ(retval, current_row); + ReadRowAndAssertPosition(current_row); + // reading the row advances by 1 + current_row += 1; // row=34 + + // skip a few more but stay inside the row group + retval = reader_.SkipRows(4); + current_row += 4; // row=38 + ASSERT_EQ(retval, 4); + ReadRowAndAssertPosition(current_row); + current_row += 1; // row=39 + + // skip one more row to get to a group boundary + retval = reader_.SkipRows(1); + current_row += 1; // row=40 + ASSERT_EQ(retval, 1); + ReadRowAndAssertPosition(current_row); + + // finally, skip off the end of the file + retval = reader_.SkipRows(10); + ASSERT_EQ(retval, 9); // requested to skip 10 but only 9 rows left in file + EXPECT_TRUE(reader_.eof()); +} + } // namespace test } // namespace parquet From 36ddbb531cac9b9e512dfa3776d1d64db588209f Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Thu, 3 Aug 2023 12:28:27 -0400 Subject: [PATCH 088/749] GH-36953: [MATLAB] Add gateway `arrow.array` function to create Arrow Arrays from MATLAB data (#36978) ### Rationale for this change As discussed in #36855, we think it would be better to move the recommended APIs for the MATLAB Interface directly under the top-level `arrow.*` package. This should help simplify the interface, and will make it easier for users to switch between multiple language bindings. We have already moved the `type` convenience constructors to the `arrow` package. Now we want to add a gateway function that creates arrays to mirror `PyArrow`. As part of this change, we will modify the array constructors to accept `libmexclass.proxy.Proxy` objects - similar to how the `arrow.type.` constructors accept `libmexclass.proxy.Proxy` objects. ### What changes are included in this PR? 1. Added `arrow.array()` gateway function that can be used to construct arrays: ```matlab >> arrowArray = arrow.array([1 2 3 4]); >> class(arrowArray) ans = 'arrow.array.Float64Array' >> arrowArray = arrow.array(["A" "B" "C"]); >> class(arrowArray) ans = 'arrow.array.StringArray' ``` 2. Added a static `fromMATLAB()` method to all subclasses of`arrow.array.Array`. ```matlab >> array = arrow.array.StringArray.fromMATLAB(["A" "B" "C"]) array = [ "A", "B", "C" ] >> array = arrow.array.TimestampArray.fromMATLAB(datetime(2023, 8, 1)) array = [ 2023-08-01 00:00:00.000000 ] ``` As part of this change, users can no longer use the `arrow.array.Array` subclass constructors to create arrays. Instead, they can use either `arrow.array()` or the static `fromMATLAB` method. ### Are these changes tested? Updated the existing tests to account for the API changes and added the following new test classes: 1. arrow/internal/validate/tType.m 2. arrow/internal/validate/tShape.m 3. arrow/internal/validate/tRealNumeric.m 4. arrow/internal/validate/tNonsparse.m 5. arrow/internal/validate/tNumeric.m 6. arrow/array/tArray.m ### Are there any user-facing changes? Yes, we changed the signature of all `arrow.array.Array` subclasses to accept scalar `libmexclass.proxy.Proxy` classes. NOTE: The MATLAB interface is still under active development. ### Future Directions 1. 
In a followup PR, we plan on adding a new name-value pair to `arrow.array()` called `Type`, which can be set to an `arrow.type.Type` object. This will let users specify what kind of arrow array they would like to create from MATLAB data. * Closes: #36953 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- matlab/src/matlab/+arrow/+array/Array.m | 7 +- .../src/matlab/+arrow/+array/BooleanArray.m | 33 +++++-- .../src/matlab/+arrow/+array/Float32Array.m | 16 +++- .../src/matlab/+arrow/+array/Float64Array.m | 16 +++- matlab/src/matlab/+arrow/+array/Int16Array.m | 16 +++- matlab/src/matlab/+arrow/+array/Int32Array.m | 16 +++- matlab/src/matlab/+arrow/+array/Int64Array.m | 16 +++- matlab/src/matlab/+arrow/+array/Int8Array.m | 16 +++- .../src/matlab/+arrow/+array/NumericArray.m | 30 ++++--- matlab/src/matlab/+arrow/+array/StringArray.m | 43 ++++++--- .../src/matlab/+arrow/+array/TimestampArray.m | 38 +++++--- matlab/src/matlab/+arrow/+array/UInt16Array.m | 16 +++- matlab/src/matlab/+arrow/+array/UInt32Array.m | 16 +++- matlab/src/matlab/+arrow/+array/UInt64Array.m | 16 +++- matlab/src/matlab/+arrow/+array/UInt8Array.m | 16 +++- .../+arrow/+internal/+validate/nonsparse.m | 25 ++++++ .../+arrow/+internal/+validate/numeric.m | 24 +++++ .../+validate}/parseValidElements.m | 0 .../+arrow/+internal/+validate/realnumeric.m | 25 ++++++ .../matlab/+arrow/+internal/+validate/shape.m | 26 ++++++ .../matlab/+arrow/+internal/+validate/type.m | 25 ++++++ .../src/matlab/+arrow/+tabular/RecordBatch.m | 41 +-------- matlab/src/matlab/+arrow/array.m | 67 ++++++++++++++ matlab/test/arrow/array/hNumericArray.m | 43 ++++----- matlab/test/arrow/array/tArray.m | 88 +++++++++++++++++++ matlab/test/arrow/array/tBooleanArray.m | 49 ++++++----- matlab/test/arrow/array/tFloat32Array.m | 24 ++--- matlab/test/arrow/array/tFloat64Array.m | 28 +++--- matlab/test/arrow/array/tInt16Array.m | 2 +- matlab/test/arrow/array/tInt32Array.m | 2 +- matlab/test/arrow/array/tInt64Array.m | 2 +- matlab/test/arrow/array/tInt8Array.m | 2 +- matlab/test/arrow/array/tStringArray.m | 65 +++++++------- matlab/test/arrow/array/tTimestampArray.m | 64 ++++++++------ matlab/test/arrow/array/tUInt16Array.m | 2 +- matlab/test/arrow/array/tUInt32Array.m | 2 +- matlab/test/arrow/array/tUInt64Array.m | 2 +- matlab/test/arrow/array/tUInt8Array.m | 2 +- .../test/arrow/internal/validate/tNonsparse.m | 44 ++++++++++ .../test/arrow/internal/validate/tNumeric.m | 56 ++++++++++++ .../validate}/tParseValidElements.m | 2 +- .../arrow/internal/validate/tRealNumeric.m | 43 +++++++++ matlab/test/arrow/internal/validate/tShape.m | 69 +++++++++++++++ .../arrow/internal/validate/tType.m} | 34 ++++--- 44 files changed, 893 insertions(+), 276 deletions(-) create mode 100644 matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m create mode 100644 matlab/src/matlab/+arrow/+internal/+validate/numeric.m rename matlab/src/matlab/+arrow/{+args => +internal/+validate}/parseValidElements.m (100%) create mode 100644 matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m create mode 100644 matlab/src/matlab/+arrow/+internal/+validate/shape.m create mode 100644 matlab/src/matlab/+arrow/+internal/+validate/type.m create mode 100644 matlab/src/matlab/+arrow/array.m create mode 100644 matlab/test/arrow/array/tArray.m create mode 100644 matlab/test/arrow/internal/validate/tNonsparse.m create mode 100644 matlab/test/arrow/internal/validate/tNumeric.m rename matlab/test/arrow/{args => internal/validate}/tParseValidElements.m (99%) create mode 100644 
matlab/test/arrow/internal/validate/tRealNumeric.m create mode 100644 matlab/test/arrow/internal/validate/tShape.m rename matlab/{src/matlab/+arrow/+args/validateTypeAndShape.m => test/arrow/internal/validate/tType.m} (55%) diff --git a/matlab/src/matlab/+arrow/+array/Array.m b/matlab/src/matlab/+arrow/+array/Array.m index 7426052764166..46b46660e7cf3 100644 --- a/matlab/src/matlab/+arrow/+array/Array.m +++ b/matlab/src/matlab/+arrow/+array/Array.m @@ -31,8 +31,11 @@ end methods - function obj = Array(varargin) - obj.Proxy = libmexclass.proxy.Proxy(varargin{:}); + function obj = Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy + end + obj.Proxy = proxy; end function numElements = get.Length(obj) diff --git a/matlab/src/matlab/+arrow/+array/BooleanArray.m b/matlab/src/matlab/+arrow/+array/BooleanArray.m index f4d341efce9d3..b9ef36b5a70c9 100644 --- a/matlab/src/matlab/+arrow/+array/BooleanArray.m +++ b/matlab/src/matlab/+arrow/+array/BooleanArray.m @@ -21,16 +21,12 @@ end methods - function obj = BooleanArray(data, opts) - arguments - data - opts.InferNulls(1,1) logical = true - opts.Valid + function obj = BooleanArray(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.BooleanArray")} end - arrow.args.validateTypeAndShape(data, "logical"); - validElements = arrow.args.parseValidElements(data, opts); - opts = struct(MatlabArray=data, Valid=validElements); - obj@arrow.array.Array("Name", "arrow.array.proxy.BooleanArray", "ConstructorArguments", {opts}); + import arrow.internal.proxy.validate + obj@arrow.array.Array(proxy); end function data = logical(obj) @@ -42,4 +38,23 @@ matlabArray(~obj.Valid) = obj.NullSubstitionValue; end end + + methods (Static) + function array = fromMATLAB(data, opts) + arguments + data + opts.InferNulls(1, 1) logical = true + opts.Valid + end + + arrow.internal.validate.type(data, "logical"); + arrow.internal.validate.shape(data); + arrow.internal.validate.nonsparse(data); + validElements = arrow.internal.validate.parseValidElements(data, opts); + + args = struct(MatlabArray=data, Valid=validElements); + proxy = arrow.internal.proxy.create("arrow.array.proxy.BooleanArray", args); + array = arrow.array.BooleanArray(proxy); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/Float32Array.m b/matlab/src/matlab/+arrow/+array/Float32Array.m index c6be563d8621f..fe90db335b5aa 100644 --- a/matlab/src/matlab/+arrow/+array/Float32Array.m +++ b/matlab/src/matlab/+arrow/+array/Float32Array.m @@ -21,13 +21,23 @@ end methods - function obj = Float32Array(data, varargin) - obj@arrow.array.NumericArray(data, "single", ... 
- "arrow.array.proxy.Float32Array", varargin{:}); + function obj = Float32Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.Float32Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = single(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.Float32Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/Float64Array.m b/matlab/src/matlab/+arrow/+array/Float64Array.m index ff43ebc0536c0..ecf91e28954b5 100644 --- a/matlab/src/matlab/+arrow/+array/Float64Array.m +++ b/matlab/src/matlab/+arrow/+array/Float64Array.m @@ -21,13 +21,23 @@ end methods - function obj = Float64Array(data, varargin) - obj@arrow.array.NumericArray(data, "double", ... - "arrow.array.proxy.Float64Array", varargin{:}); + function obj = Float64Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.Float64Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = double(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.Float64Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/Int16Array.m b/matlab/src/matlab/+arrow/+array/Int16Array.m index 533f0c9ef549d..53c96c6eeb85c 100644 --- a/matlab/src/matlab/+arrow/+array/Int16Array.m +++ b/matlab/src/matlab/+arrow/+array/Int16Array.m @@ -21,13 +21,23 @@ end methods - function obj = Int16Array(data, varargin) - obj@arrow.array.NumericArray(data, "int16", ... - "arrow.array.proxy.Int16Array", varargin{:}); + function obj = Int16Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.Int16Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = int16(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.Int16Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/Int32Array.m b/matlab/src/matlab/+arrow/+array/Int32Array.m index 0f977fb90f808..d85bcaf627f7b 100644 --- a/matlab/src/matlab/+arrow/+array/Int32Array.m +++ b/matlab/src/matlab/+arrow/+array/Int32Array.m @@ -21,13 +21,23 @@ end methods - function obj = Int32Array(data, varargin) - obj@arrow.array.NumericArray(data, "int32", ... 
- "arrow.array.proxy.Int32Array", varargin{:}); + function obj = Int32Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.Int32Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = int32(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.Int32Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/Int64Array.m b/matlab/src/matlab/+arrow/+array/Int64Array.m index 94cad56519b11..72199df88ded1 100644 --- a/matlab/src/matlab/+arrow/+array/Int64Array.m +++ b/matlab/src/matlab/+arrow/+array/Int64Array.m @@ -21,13 +21,23 @@ end methods - function obj = Int64Array(data, varargin) - obj@arrow.array.NumericArray(data, "int64", ... - "arrow.array.proxy.Int64Array", varargin{:}); + function obj = Int64Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.Int64Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = int64(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.Int64Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/Int8Array.m b/matlab/src/matlab/+arrow/+array/Int8Array.m index 83a14caa27287..0e9d8eec0edf5 100644 --- a/matlab/src/matlab/+arrow/+array/Int8Array.m +++ b/matlab/src/matlab/+arrow/+array/Int8Array.m @@ -21,13 +21,23 @@ end methods - function obj = Int8Array(data, varargin) - obj@arrow.array.NumericArray(data, "int8", ... 
- "arrow.array.proxy.Int8Array", varargin{:}); + function obj = Int8Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.Int8Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = int8(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.Int8Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/NumericArray.m b/matlab/src/matlab/+arrow/+array/NumericArray.m index fb2fc1d333939..8f465ce425e23 100644 --- a/matlab/src/matlab/+arrow/+array/NumericArray.m +++ b/matlab/src/matlab/+arrow/+array/NumericArray.m @@ -21,18 +21,11 @@ end methods - function obj = NumericArray(data, type, proxyName, opts) + function obj = NumericArray(proxy) arguments - data - type(1, 1) string - proxyName(1, 1) string - opts.InferNulls(1, 1) logical = true - opts.Valid + proxy(1, 1) libmexclass.proxy.Proxy end - arrow.args.validateTypeAndShape(data, type); - validElements = arrow.args.parseValidElements(data, opts); - opts = struct(MatlabArray=data, Valid=validElements); - obj@arrow.array.Array("Name", proxyName, "ConstructorArguments", {opts}); + obj@arrow.array.Array(proxy); end function matlabArray = toMATLAB(obj) @@ -40,5 +33,22 @@ matlabArray(~obj.Valid) = obj.NullSubstitutionValue; end end + + methods (Static) + function array = fromMATLAB(data, traits, opts) + arguments + data + traits(1, 1) arrow.type.traits.TypeTraits + opts.InferNulls(1, 1) logical = true + opts.Valid + end + + arrow.internal.validate.numeric(data, traits.MatlabClassName); + validElements = arrow.internal.validate.parseValidElements(data, opts); + args = struct(MatlabArray=data, Valid=validElements); + proxy = arrow.internal.proxy.create(traits.ArrayProxyClassName, args); + array = traits.ArrayConstructor(proxy); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/StringArray.m b/matlab/src/matlab/+arrow/+array/StringArray.m index ec2d53b371fe2..18fdec9ac70c3 100644 --- a/matlab/src/matlab/+arrow/+array/StringArray.m +++ b/matlab/src/matlab/+arrow/+array/StringArray.m @@ -21,21 +21,12 @@ end methods - function obj = StringArray(data, opts) - arguments - data - opts.InferNulls(1,1) logical = true - opts.Valid + function obj = StringArray(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.StringArray")} end - % Support constructing a StringArray from a cell array of strings (i.e. cellstr), - % or a string array, but not a char array. - if ~ischar(data) - data = convertCharsToStrings(data); - end - arrow.args.validateTypeAndShape(data, "string"); - validElements = arrow.args.parseValidElements(data, opts); - opts = struct(MatlabArray=data, Valid=validElements); - obj@arrow.array.Array("Name", "arrow.array.proxy.StringArray", "ConstructorArguments", {opts}); + import arrow.internal.proxy.validate + obj@arrow.array.Array(proxy); end function data = string(obj) @@ -47,4 +38,28 @@ matlabArray(~obj.Valid) = obj.NullSubstitionValue; end end + + methods (Static) + function array = fromMATLAB(data, opts) + arguments + data + opts.InferNulls(1, 1) logical = true + opts.Valid + end + + % Support constructing a StringArray from a cell array of strings + % (i.e. cellstr), or a string array, but not a char array. 
+ if ~ischar(data) + data = convertCharsToStrings(data); + end + + arrow.internal.validate.type(data, "string"); + arrow.internal.validate.shape(data); + validElements = arrow.internal.validate.parseValidElements(data, opts); + + args = struct(MatlabArray=data, Valid=validElements); + proxy = arrow.internal.proxy.create("arrow.array.proxy.StringArray", args); + array = arrow.array.StringArray(proxy); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/TimestampArray.m b/matlab/src/matlab/+arrow/+array/TimestampArray.m index 0f0da4e82130c..3b05903bb13ea 100644 --- a/matlab/src/matlab/+arrow/+array/TimestampArray.m +++ b/matlab/src/matlab/+arrow/+array/TimestampArray.m @@ -21,20 +21,12 @@ end methods - function obj = TimestampArray(data, opts) + function obj = TimestampArray(proxy) arguments - data - opts.TimeUnit(1, 1) arrow.type.TimeUnit = arrow.type.TimeUnit.Microsecond - opts.InferNulls(1, 1) logical = true - opts.Valid + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.TimestampArray")} end - arrow.args.validateTypeAndShape(data, "datetime"); - validElements = arrow.args.parseValidElements(data, opts); - ptime = arrow.array.TimestampArray.convertToEpochTime(data, opts.TimeUnit); - timezone = string(data.TimeZone); - - args = struct(MatlabArray=ptime, Valid=validElements, TimeZone=timezone, TimeUnit=string(opts.TimeUnit)); - obj@arrow.array.Array("Name", "arrow.array.proxy.TimestampArray", "ConstructorArguments", {args}); + import arrow.internal.proxy.validate + obj@arrow.array.Array(proxy); end function dates = toMATLAB(obj) @@ -70,4 +62,26 @@ time(indices) = convertTo(dates(indices), "epochtime", TicksPerSecond=ticksPerSecond(units)); end end + + methods(Static) + function array = fromMATLAB(data, opts) + arguments + data + opts.TimeUnit(1, 1) arrow.type.TimeUnit = arrow.type.TimeUnit.Microsecond + opts.InferNulls(1, 1) logical = true + opts.Valid + end + + arrow.internal.validate.type(data, "datetime"); + arrow.internal.validate.shape(data); + + validElements = arrow.internal.validate.parseValidElements(data, opts); + epochTime = arrow.array.TimestampArray.convertToEpochTime(data, opts.TimeUnit); + timezone = string(data.TimeZone); + + args = struct(MatlabArray=epochTime, Valid=validElements, TimeZone=timezone, TimeUnit=string(opts.TimeUnit)); + proxy = arrow.internal.proxy.create("arrow.array.proxy.TimestampArray", args); + array = arrow.array.TimestampArray(proxy); + end + end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+array/UInt16Array.m b/matlab/src/matlab/+arrow/+array/UInt16Array.m index 4862ca20b9f88..9d3f33c279175 100644 --- a/matlab/src/matlab/+arrow/+array/UInt16Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt16Array.m @@ -21,13 +21,23 @@ end methods - function obj = UInt16Array(data, varargin) - obj@arrow.array.NumericArray(data, "uint16", ... 
- "arrow.array.proxy.UInt16Array", varargin{:}); + function obj = UInt16Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.UInt16Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = uint16(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.UInt16Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/UInt32Array.m b/matlab/src/matlab/+arrow/+array/UInt32Array.m index 782b0010997fc..5235d4fb15576 100644 --- a/matlab/src/matlab/+arrow/+array/UInt32Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt32Array.m @@ -21,13 +21,23 @@ end methods - function obj = UInt32Array(data, varargin) - obj@arrow.array.NumericArray(data, "uint32", ... - "arrow.array.proxy.UInt32Array", varargin{:}); + function obj = UInt32Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.UInt32Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = uint32(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.UInt32Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/UInt64Array.m b/matlab/src/matlab/+arrow/+array/UInt64Array.m index 9e25ce4987bc1..2d69bd031ac31 100644 --- a/matlab/src/matlab/+arrow/+array/UInt64Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt64Array.m @@ -21,13 +21,23 @@ end methods - function obj = UInt64Array(data, varargin) - obj@arrow.array.NumericArray(data, "uint64", ... - "arrow.array.proxy.UInt64Array", varargin{:}); + function obj = UInt64Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.UInt64Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = uint64(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.UInt64Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+array/UInt8Array.m b/matlab/src/matlab/+arrow/+array/UInt8Array.m index 8bad2401bd429..3d007376bc89a 100644 --- a/matlab/src/matlab/+arrow/+array/UInt8Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt8Array.m @@ -21,13 +21,23 @@ end methods - function obj = UInt8Array(data, varargin) - obj@arrow.array.NumericArray(data, "uint8", ... 
- "arrow.array.proxy.UInt8Array", varargin{:}); + function obj = UInt8Array(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.array.proxy.UInt8Array")} + end + import arrow.internal.proxy.validate + obj@arrow.array.NumericArray(proxy); end function data = uint8(obj) data = obj.toMATLAB(); end end + + methods (Static) + function array = fromMATLAB(data, varargin) + traits = arrow.type.traits.UInt8Traits; + array = arrow.array.NumericArray.fromMATLAB(data, traits, varargin{:}); + end + end end diff --git a/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m b/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m new file mode 100644 index 0000000000000..8f7557c18b658 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m @@ -0,0 +1,25 @@ +%NONSPARESE Verifies data is nonsparse. Otherwise throws an error with the +% identifier "arrrow:array:Sparse". + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function nonsparse(data) + if issparse(data) + errid = "arrow:array:Sparse"; + msg = "Sparse arrays are not supported."; + error(errid, msg); + end +end diff --git a/matlab/src/matlab/+arrow/+internal/+validate/numeric.m b/matlab/src/matlab/+arrow/+internal/+validate/numeric.m new file mode 100644 index 0000000000000..9dfe6dfada2de --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+validate/numeric.m @@ -0,0 +1,24 @@ +%NUMERIC Validates data is a real, nonsparse, numeric array that is +%either a vector or an empty array. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+function numeric(data, type) + arrow.internal.validate.type(data, type); + arrow.internal.validate.shape(data); + arrow.internal.validate.nonsparse(data); + arrow.internal.validate.realnumeric(data); +end + diff --git a/matlab/src/matlab/+arrow/+args/parseValidElements.m b/matlab/src/matlab/+arrow/+internal/+validate/parseValidElements.m similarity index 100% rename from matlab/src/matlab/+arrow/+args/parseValidElements.m rename to matlab/src/matlab/+arrow/+internal/+validate/parseValidElements.m diff --git a/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m b/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m new file mode 100644 index 0000000000000..1f57cee4d8622 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m @@ -0,0 +1,25 @@ +%REALNUMERIC Verifies the numeric array data is real. Otherwise throws an +% error with the identifier "arrow:array:ComplexNumeric". + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function realnumeric(data) + if ~isreal(data) + errid = "arrow:array:ComplexNumeric"; + msg = "Complex numeric arrays are not supported."; + error(errid, msg); + end +end diff --git a/matlab/src/matlab/+arrow/+internal/+validate/shape.m b/matlab/src/matlab/+arrow/+internal/+validate/shape.m new file mode 100644 index 0000000000000..d9a8e29076893 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+validate/shape.m @@ -0,0 +1,26 @@ +%SHAPE Verifies data is either a vector or empty. Otherwise throws an error +% with the identifier "arrow:array:InvalidShape". + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License.
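+
+% Examples (illustrative):
+%   arrow.internal.validate.shape([1 2 3])      % vector: passes
+%   arrow.internal.validate.shape(double.empty) % empty: passes
+%   arrow.internal.validate.shape(ones(2, 2))   % matrix: errors with arrow:array:InvalidShape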
+ +function shape(data) + if ~isvector(data) && ~isempty(data) + errid = "arrow:array:InvalidShape"; + msg = "Expected input array to be a vector or empty."; + error(errid, msg); + end +end + diff --git a/matlab/src/matlab/+arrow/+internal/+validate/type.m b/matlab/src/matlab/+arrow/+internal/+validate/type.m new file mode 100644 index 0000000000000..7977d812adbf6 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+validate/type.m @@ -0,0 +1,25 @@ +%TYPE Verifies data has the expected class type. Otherwise throws an +% error with the identifier "arrow:array:InvalidType". + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function type(data, classtype) + if ~isa(data, classtype) + errid = "arrow:array:InvalidType"; + msg = join(["Expected data to be a", classtype, "array"]); + error(errid, msg); + end +end diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index a7feb0c0a3bd7..9af09702e1cf5 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -89,49 +89,10 @@ % Convert each MATLAB array into a corresponding % arrow.array.Array. for ii = 1:numColumns - arrowArrays{ii} = arrow.tabular.RecordBatch.makeArray(T{:, ii}); + arrowArrays{ii} = arrow.array(T{:, ii}); end end - function arrowArray = makeArray(matlabArray) - % Decompose the input MATLAB table - % input a cell array of equivalent arrow.array.Array - % instances. - - switch class(matlabArray) - case "single" - arrowArray = arrow.array.Float32Array(matlabArray); - case "double" - arrowArray = arrow.array.Float64Array(matlabArray); - case "uint8" - arrowArray = arrow.array.UInt8Array(matlabArray); - case "uint16" - arrowArray = arrow.array.UInt16Array(matlabArray); - case "uint32" - arrowArray = arrow.array.UInt32Array(matlabArray); - case "uint64" - arrowArray = arrow.array.UInt64Array(matlabArray); - case "int8" - arrowArray = arrow.array.Int8Array(matlabArray); - case "int16" - arrowArray = arrow.array.Int16Array(matlabArray); - case "int32" - arrowArray = arrow.array.Int32Array(matlabArray); - case "int64" - arrowArray = arrow.array.Int64Array(matlabArray); - case "logical" - arrowArray = arrow.array.BooleanArray(matlabArray); - case "string" - arrowArray = arrow.array.StringArray(matlabArray); - case "datetime" - arrowArray = arrow.array.TimestampArray(matlabArray); - otherwise - error("arrow:tabular:recordbatch:UnsupportedMatlabArrayType", ... - "RecordBatch cannot be constructed from a MATLAB array of type '" + class(matlabArray) + "'."); - end - - end - function proxyIDs = getArrowProxyIDs(arrowArrays) % Extract the Proxy IDs underlying a cell array of % arrow.array.Array instances.
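For context, the new `arrow.array` gateway function (added below in `matlab/src/matlab/+arrow/array.m`) dispatches on the MATLAB class of the input data. A usage sketch (variable names are illustrative; the return classes follow the switch statement in `array.m`):

```
boolArray = arrow.array([true false true]);           % arrow.array.BooleanArray
doubleArray = arrow.array([1 2 NaN 3]);               % arrow.array.Float64Array; NaN inferred as null
int8Array = arrow.array(int8([1 2 3]), Valid=[1 3]);  % arrow.array.Int8Array; element 2 marked null
stringArray = arrow.array({'a', 'b', 'c'});           % cellstr converted to string -> arrow.array.StringArray
```

Unsupported classes (e.g. `table`) error with the identifier `arrow:array:UnsupportedMATLABType`, as exercised in `tArray.m`.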
diff --git a/matlab/src/matlab/+arrow/array.m b/matlab/src/matlab/+arrow/array.m new file mode 100644 index 0000000000000..a2d0ecd2e8f76 --- /dev/null +++ b/matlab/src/matlab/+arrow/array.m @@ -0,0 +1,67 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function arrowArray = array(data, opts) + arguments + data + opts.InferNulls(1, 1) logical = true + opts.Valid + end + + data = convertCellstrToString(data); + classname = string(class(data)); + args = namedargs2cell(opts); + + switch (classname) + case "logical" + arrowArray = arrow.array.BooleanArray.fromMATLAB(data, args{:}); + case "uint8" + arrowArray = arrow.array.UInt8Array.fromMATLAB(data, args{:}); + case "uint16" + arrowArray = arrow.array.UInt16Array.fromMATLAB(data, args{:}); + case "uint32" + arrowArray = arrow.array.UInt32Array.fromMATLAB(data, args{:}); + case "uint64" + arrowArray = arrow.array.UInt64Array.fromMATLAB(data, args{:}); + case "int8" + arrowArray = arrow.array.Int8Array.fromMATLAB(data, args{:}); + case "int16" + arrowArray = arrow.array.Int16Array.fromMATLAB(data, args{:}); + case "int32" + arrowArray = arrow.array.Int32Array.fromMATLAB(data, args{:}); + case "int64" + arrowArray = arrow.array.Int64Array.fromMATLAB(data, args{:}); + case "single" + arrowArray = arrow.array.Float32Array.fromMATLAB(data, args{:}); + case "double" + arrowArray = arrow.array.Float64Array.fromMATLAB(data, args{:}); + case "string" + arrowArray = arrow.array.StringArray.fromMATLAB(data, args{:}); + case "datetime" + arrowArray = arrow.array.TimestampArray.fromMATLAB(data, args{:}); + otherwise + errid = "arrow:array:UnsupportedMATLABType"; + msg = join(["Unable to convert MATLAB type" classname "to arrow array."]); + error(errid, msg); + end +end + +function data = convertCellstrToString(data) + % Support constructing a StringArray from a cell array of strings + % (i.e. cellstr), or a string array, but not a char array. 
+ if ~ischar(data) + data = convertCharsToStrings(data); + end +end \ No newline at end of file diff --git a/matlab/test/arrow/array/hNumericArray.m b/matlab/test/arrow/array/hNumericArray.m index f9f5f1d9e4ee3..fc5f5a05998ed 100644 --- a/matlab/test/arrow/array/hNumericArray.m +++ b/matlab/test/arrow/array/hNumericArray.m @@ -18,7 +18,7 @@ properties (Abstract) ArrowArrayClassName(1, 1) string - ArrowArrayConstructor + ArrowArrayConstructorFcn MatlabArrayFcn MatlabConversionFcn MaxValue (1, 1) @@ -38,24 +38,24 @@ function verifyOnMatlabPath(tc) methods(Test) function BasicTest(tc) - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3])); + A = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([1 2 3])); className = string(class(A)); tc.verifyEqual(className, tc.ArrowArrayClassName); end function ToMATLAB(tc) % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100)); + A1 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(100)); data = toMATLAB(A1); tc.verifyEqual(data, tc.MatlabArrayFcn(100)); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3])); + A2 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([1 2 3])); data = toMATLAB(A2); tc.verifyEqual(data, tc.MatlabArrayFcn([1 2 3]')); % Create a Float64Array from an empty double vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([])); + A3 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([])); data = toMATLAB(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end @@ -65,54 +65,55 @@ function MatlabConversion(tc) % arrow.array.Float32Array, double for array.array.Float64Array % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100)); + A1 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(100)); data = tc.MatlabConversionFcn(A1); tc.verifyEqual(data, tc.MatlabArrayFcn(100)); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1 2 3])); + A2 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([1 2 3])); data = tc.MatlabConversionFcn(A2); tc.verifyEqual(data, tc.MatlabArrayFcn([1 2 3]')); % Create an array from an empty vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([])); + A3 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([])); data = tc.MatlabConversionFcn(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end function MinValueTest(tc) - A = tc.ArrowArrayConstructor(tc.MinValue); + A = tc.ArrowArrayConstructorFcn(tc.MinValue); tc.verifyEqual(toMATLAB(A), tc.MinValue); end function MaxValueTest(tc) - A1 = tc.ArrowArrayConstructor(tc.MaxValue); + A1 = tc.ArrowArrayConstructorFcn(tc.MaxValue); tc.verifyEqual(toMATLAB(A1), tc.MaxValue); end function ErrorIfComplex(tc) - fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn([10 + 1i, 4])); - tc.verifyError(fcn, "MATLAB:expectedReal"); + fcn = @() tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([10 + 1i, 4])); + tc.verifyError(fcn, "arrow:array:ComplexNumeric"); end function ErrorIfNonVector(tc) data = tc.MatlabArrayFcn([1 2 3 4 5 6 7 8 9]); data = reshape(data, 3, 1, 3); - fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn(data)); - tc.verifyError(fcn, "MATLAB:expectedVector"); + fcn = @() tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(data)); + tc.verifyError(fcn, "arrow:array:InvalidShape"); end - function ErrorIfEmptyArrayIsNotTwoDimensional(tc) + function AllowNDimensionalEmptyArray(tc) data = tc.MatlabArrayFcn(reshape([], [1 0 0])); - fcn = @() tc.ArrowArrayConstructor(data); - tc.verifyError(fcn, "MATLAB:expected2D"); + A = 
tc.ArrowArrayConstructorFcn(data); + tc.verifyEqual(A.Length, int64(0)); + tc.verifyEqual(toMATLAB(A), tc.MatlabArrayFcn(reshape([], [0 1]))); end function LogicalValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a logical array data = tc.MatlabArrayFcn([1 2 3 4]); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[false true true false]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[false true true false]); expectedData = data'; expectedData([1 4]) = tc.NullSubstitutionValue; @@ -125,7 +126,7 @@ function NumericValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a array of indices data = tc.MatlabArrayFcn([1 2 3 4]); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[2 4]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[2 4]); expectedData = data'; expectedData([1 3]) = tc.NullSubstitutionValue; @@ -136,7 +137,7 @@ function NumericValidNVPair(tc) % Make sure the optimization where the valid-bitmap is stored % as a nullptr works as expected. expectedData = data'; - arrowArray = tc.ArrowArrayConstructor(data, Valid=[1, 2, 3, 4]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[1, 2, 3, 4]); tc.verifyEqual(tc.MatlabConversionFcn(arrowArray), expectedData); tc.verifyEqual(toMATLAB(arrowArray), expectedData); tc.verifyEqual(arrowArray.Valid, [true; true; true; true]); @@ -145,7 +146,7 @@ function NumericValidNVPair(tc) function TestArrowType(tc) % Verify the array has the expected arrow.type.Type object data = tc.MatlabArrayFcn([1 2 3 4]); - arrowArray = tc.ArrowArrayConstructor(data); + arrowArray = tc.ArrowArrayConstructorFcn(data); tc.verifyEqual(arrowArray.Type.ID, tc.ArrowType.ID); end end diff --git a/matlab/test/arrow/array/tArray.m b/matlab/test/arrow/array/tArray.m new file mode 100644 index 0000000000000..4a476696bfacf --- /dev/null +++ b/matlab/test/arrow/array/tArray.m @@ -0,0 +1,88 @@ +%TARRAY Unit tests for arrow.array function. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tArray < matlab.unittest.TestCase + + properties(TestParameter) + MATLABDataArrayTypePair = { ... + {[true false], "arrow.array.BooleanArray"}, ... + {int8([1 2]), "arrow.array.Int8Array"}, ... + {uint8([1 2]), "arrow.array.UInt8Array"}, ... + {int16([1 2]), "arrow.array.Int16Array"}, ... + {uint16([1 2]), "arrow.array.UInt16Array"}, ... + {int32([1 2]), "arrow.array.Int32Array"}, ... + {uint32([1 2]), "arrow.array.UInt32Array"}, ... + {int64([1 2]), "arrow.array.Int64Array"}, ... + {uint64([1 2]), "arrow.array.UInt64Array"}, ... + {single([1 2]), "arrow.array.Float32Array"}, ... + {[1 2], "arrow.array.Float64Array"}, ... + {datetime(2022, 1, 1), "arrow.array.TimestampArray"}, ... 
+ {["A" "B"], "arrow.array.StringArray"}}; + end + + methods(Test) + function ArrowArrayOutputType(testCase, MATLABDataArrayTypePair) + % Verify arrow.array returns the expected arrow.array.Array + % with respect to the input data array's MATLAB class type. + matlabArray = MATLABDataArrayTypePair{1}; + expectedClassName = MATLABDataArrayTypePair{2}; + arrowArray = arrow.array(matlabArray); + actualClassName = string(class(arrowArray)); + testCase.verifyEqual(actualClassName, expectedClassName); + end + + function UnsupportedMATLABTypeError(testCase) + % Verify arrow.array throws an error with the identifier + % "arrow:array:UnsupportedMATLABType" if the input array is not one + % we support converting into an Arrow array. + matlabArray = table; + fcn = @() arrow.array(matlabArray); + errID = "arrow:array:UnsupportedMATLABType"; + testCase.verifyError(fcn, errID); + end + + function InferNullsDefault(testCase) + % Verify InferNulls is true by default. + matlabArray = [1 2 NaN 3]; + arrowArray = arrow.array(matlabArray); + testCase.verifyEqual(arrowArray.Valid, [true; true; false; true]); + end + + function InferNullsTrue(testCase) + % Verify InferNulls is true by default. + matlabArray = [1 2 NaN 3]; + arrowArray = arrow.array(matlabArray, InferNulls=true); + testCase.verifyEqual(arrowArray.Valid, [true; true; false; true]); + end + + function InferNullsFalse(testCase) + % Verify Valid is the expected logical vector when + % InferNulls=false. + matlabArray = [1 2 NaN 3]; + arrowArray = arrow.array(matlabArray, InferNulls=false); + testCase.verifyEqual(arrowArray.Valid, [true; true; true; true]); + end + + function ValidNameValuePair(testCase) + % Verify Valid is the expected vector when the Valid + % name-value pair is supplied. + matlabArray = [1 NaN NaN 3]; + arrowArray = arrow.array(matlabArray, Valid=[1 2]); + testCase.verifyEqual(arrowArray.Valid, [true; true; false; false]); + end + end +end \ No newline at end of file diff --git a/matlab/test/arrow/array/tBooleanArray.m b/matlab/test/arrow/array/tBooleanArray.m index e27ca11285a50..df7f052597075 100644 --- a/matlab/test/arrow/array/tBooleanArray.m +++ b/matlab/test/arrow/array/tBooleanArray.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName(1, 1) string = "arrow.array.BooleanArray" - ArrowArrayConstructor = @arrow.array.BooleanArray + ArrowArrayConstructorFcn = @arrow.array.BooleanArray.fromMATLAB MatlabArrayFcn = @logical MatlabConversionFcn = @logical NullSubstitutionValue = false @@ -36,34 +36,34 @@ function verifyOnMatlabPath(tc) methods(Test) function BasicTest(tc) - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([true false true])); + A = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([true false true])); className = string(class(A)); tc.verifyEqual(className, tc.ArrowArrayClassName); end function ToMATLAB(tc) % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(true)); + A1 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(true)); data = toMATLAB(A1); tc.verifyEqual(data, tc.MatlabArrayFcn(true)); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([true false true])); + A2 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([true false true])); data = toMATLAB(A2); tc.verifyEqual(data, tc.MatlabArrayFcn([true false true]')); % Create a BooleanArray from an empty 0x0 logical vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(logical.empty(0, 0))); + A3 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(logical.empty(0, 0))); data = toMATLAB(A3); tc.verifyEqual(data, 
tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a BooleanArray from an empty 0x1 logical vector - A4= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(logical.empty(0, 1))); + A4= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(logical.empty(0, 1))); data = toMATLAB(A4); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a BooleanArray from an empty 1x0 logical vector - A5= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(logical.empty(0, 1))); + A5= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(logical.empty(0, 1))); data = toMATLAB(A5); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end @@ -72,27 +72,27 @@ function MatlabConversion(tc) % Tests the type-specific conversion method (i.e. logical) % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(true)); + A1 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(true)); data = tc.MatlabConversionFcn(A1); tc.verifyEqual(data, tc.MatlabArrayFcn(true)); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([true false true])); + A2 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn([true false true])); data = tc.MatlabConversionFcn(A2); tc.verifyEqual(data, tc.MatlabArrayFcn([true false true]')); % Create a BooleanArray from an empty 0x0 logical vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(logical.empty(0, 0))); + A3 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(logical.empty(0, 0))); data = tc.MatlabConversionFcn(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a BooleanArray from an empty 0x1 logical vector - A4= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(logical.empty(0, 1))); + A4= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(logical.empty(0, 1))); data = tc.MatlabConversionFcn(A4); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a BooleanArray from an empty 1x0 logical vector - A5= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(logical.empty(0, 1))); + A5= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(logical.empty(0, 1))); data = tc.MatlabConversionFcn(A5); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end @@ -101,7 +101,7 @@ function LogicalValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a logical array data = tc.MatlabArrayFcn([true false true]'); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[false true true]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[false true true]); expectedData = data; expectedData(1) = tc.NullSubstitutionValue; @@ -114,7 +114,7 @@ function NumericValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a array of indices data = tc.MatlabArrayFcn([true false true]'); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[1, 2]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[1, 2]); expectedData = data; expectedData(3) = tc.NullSubstitutionValue; @@ -126,7 +126,7 @@ function NumericValidNVPair(tc) % Make sure the optimization where the valid-bitmap is stored as % a nullptr works as expected. 
expectedData = data; - arrowArray = tc.ArrowArrayConstructor(data, Valid=[1, 2, 3]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[1, 2, 3]); tc.verifyEqual(tc.MatlabConversionFcn(arrowArray), expectedData); tc.verifyEqual(toMATLAB(arrowArray), expectedData); tc.verifyEqual(arrowArray.Valid, [true; true; true]); @@ -135,26 +135,27 @@ function NumericValidNVPair(tc) function ErrorIfNonVector(tc) data = tc.MatlabArrayFcn([true false true false true false true false true]); data = reshape(data, 3, 1, 3); - fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn(data)); - tc.verifyError(fcn, "MATLAB:expectedVector"); + fcn = @() tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(data)); + tc.verifyError(fcn, "arrow:array:InvalidShape"); end - function ErrorIfEmptyArrayIsNotTwoDimensional(tc) - data = tc.MatlabArrayFcn(reshape(logical.empty(0, 0), [1 0 0])); - fcn = @() tc.ArrowArrayConstructor(data); - tc.verifyError(fcn, "MATLAB:expected2D"); + function AllowNDimensionalEmptyArray(tc) + data = tc.MatlabArrayFcn(reshape([], [1 0 0])); + A = tc.ArrowArrayConstructorFcn(data); + tc.verifyEqual(A.Length, int64(0)); + tc.verifyEqual(toMATLAB(A), tc.MatlabArrayFcn(reshape([], [0 1]))); end function ErrorIfSparseArray(tc) data = tc.MatlabArrayFcn(sparse([true false true])); - fcn = @() tc.ArrowArrayConstructor(data); - tc.verifyError(fcn, "MATLAB:expectedNonsparse"); + fcn = @() tc.ArrowArrayConstructorFcn(data); + tc.verifyError(fcn, "arrow:array:Sparse"); end function TestArrowType(tc) % Verify the array has the expected arrow.type.Type object data = tc.MatlabArrayFcn([true false]); - arrowArray = tc.ArrowArrayConstructor(data); + arrowArray = tc.ArrowArrayConstructorFcn(data); tc.verifyEqual(arrowArray.Type.ID, tc.ArrowType.ID); end end diff --git a/matlab/test/arrow/array/tFloat32Array.m b/matlab/test/arrow/array/tFloat32Array.m index f007e2b422d6e..2ec9b90b4d277 100644 --- a/matlab/test/arrow/array/tFloat32Array.m +++ b/matlab/test/arrow/array/tFloat32Array.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName = "arrow.array.Float32Array" - ArrowArrayConstructor = @arrow.array.Float32Array + ArrowArrayConstructorFcn = @arrow.array.Float32Array.fromMATLAB MatlabConversionFcn = @single % single method on class MatlabArrayFcn = @single % single function MaxValue = realmax("single") @@ -29,7 +29,7 @@ methods(Test) function InfValues(testCase) - A1 = arrow.array.Float32Array(single([Inf -Inf])); + A1 = testCase.ArrowArrayConstructorFcn(single([Inf -Inf])); data = single(A1); testCase.verifyEqual(data, single([Inf -Inf]')); end @@ -38,7 +38,7 @@ function ValidBasic(testCase) % Create a MATLAB array with one null value (i.e. one NaN). % Verify NaN is considered a null value by default. matlabArray = single([1, NaN, 3]'); - arrowArray = arrow.array.Float32Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = [true, false, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -47,13 +47,13 @@ function InferNulls(testCase) matlabArray = single([1, NaN, 3]); % Verify NaN is treated as a null value when InferNulls=true. - arrowArray1 = arrow.array.Float32Array(matlabArray, InferNulls=true); + arrowArray1 = testCase.ArrowArrayConstructorFcn(matlabArray, InferNulls=true); expectedValid1 = [true false true]'; testCase.verifyEqual(arrowArray1.Valid, expectedValid1); testCase.verifyEqual(toMATLAB(arrowArray1), matlabArray'); % Verify NaN is not treated as a null value when InferNulls=false. 
- arrowArray2 = arrow.array.Float32Array(matlabArray, InferNulls=false); + arrowArray2 = testCase.ArrowArrayConstructorFcn(matlabArray, InferNulls=false); expectedValid2 = [true true true]'; testCase.verifyEqual(arrowArray2.Valid, expectedValid2); testCase.verifyEqual(toMATLAB(arrowArray2), matlabArray'); @@ -62,7 +62,7 @@ function InferNulls(testCase) function ValidNoNulls(testCase) % Create a MATLAB array with no null values (i.e. no NaNs). matlabArray = single([1, 2, 3]'); - arrowArray = arrow.array.Float32Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = [true, true, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -70,7 +70,7 @@ function ValidNoNulls(testCase) function ValidAllNulls(testCase) % Create a MATLAB array with all null values (i.e. all NaNs). matlabArray = single([NaN, NaN, NaN]'); - arrowArray = arrow.array.Float32Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = [false, false, false]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -78,18 +78,18 @@ function ValidAllNulls(testCase) function EmptyArrayValidBitmap(testCase) % Create an empty 0x0 MATLAB array. matlabArray = single.empty(0, 0); - arrowArray = arrow.array.Float32Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = logical.empty(0, 1); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 0x1 MATLAB array. matlabArray = single.empty(0, 1); - arrowArray = arrow.array.Float32Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 1x0 MATLAB array. matlabArray = single.empty(1, 0); - arrowArray = arrow.array.Float32Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -97,7 +97,7 @@ function LogicalValidNVPair(testCase) matlabArray = single([1 2 3]); % Supply a logical vector for Valid - arrowArray = arrow.array.Float32Array(matlabArray, Valid=[false; true; true]); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray, Valid=[false; true; true]); testCase.verifyEqual(arrowArray.Valid, [false; true; true]); testCase.verifyEqual(toMATLAB(arrowArray), single([NaN; 2; 3])); end @@ -106,7 +106,7 @@ function NumericlValidNVPair(testCase) matlabArray = single([1 2 3]); % Supply a numeric vector for Valid - arrowArray = arrow.array.Float32Array(matlabArray, Valid=[1 3]); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray, Valid=[1 3]); testCase.verifyEqual(arrowArray.Valid, [true; false; true]); testCase.verifyEqual(toMATLAB(arrowArray), single([1; NaN; 3])); end diff --git a/matlab/test/arrow/array/tFloat64Array.m b/matlab/test/arrow/array/tFloat64Array.m index 9b30ec8f25d49..a39a3d1e62cb4 100755 --- a/matlab/test/arrow/array/tFloat64Array.m +++ b/matlab/test/arrow/array/tFloat64Array.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName = "arrow.array.Float64Array" - ArrowArrayConstructor = @arrow.array.Float64Array + ArrowArrayConstructorFcn = @arrow.array.Float64Array.fromMATLAB MatlabConversionFcn = @double % double method on class MatlabArrayFcn = @double % double function MaxValue = realmax("double") @@ -29,21 +29,21 @@ methods(Test) function InfValues(testCase) - A1 = arrow.array.Float64Array([Inf -Inf]); + A1 = testCase.ArrowArrayConstructorFcn([Inf -Inf]); data = double(A1); testCase.verifyEqual(data, [Inf -Inf]'); 
end function ErrorIfSparse(testCase) - fcn = @() arrow.array.Float64Array(sparse(ones([10 1]))); - testCase.verifyError(fcn, "MATLAB:expectedNonsparse"); + fcn = @() testCase.ArrowArrayConstructorFcn(sparse(ones([10 1]))); + testCase.verifyError(fcn, "arrow:array:Sparse"); end function ValidBasic(testCase) % Create a MATLAB array with one null value (i.e. one NaN). % Verify NaN is considered a null value by default. matlabArray = [1, NaN, 3]'; - arrowArray = arrow.array.Float64Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = [true, false, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -52,13 +52,13 @@ function InferNulls(testCase) matlabArray = [1, NaN, 3]; % Verify NaN is treated as a null value when InferNulls=true. - arrowArray1 = arrow.array.Float64Array(matlabArray, InferNulls=true); + arrowArray1 = testCase.ArrowArrayConstructorFcn(matlabArray, InferNulls=true); expectedValid1 = [true false true]'; testCase.verifyEqual(arrowArray1.Valid, expectedValid1); testCase.verifyEqual(toMATLAB(arrowArray1), matlabArray'); % Verify NaN is not treated as a null value when InferNulls=false. - arrowArray2 = arrow.array.Float64Array(matlabArray, InferNulls=false); + arrowArray2 = testCase.ArrowArrayConstructorFcn(matlabArray, InferNulls=false); expectedValid2 = [true true true]'; testCase.verifyEqual(arrowArray2.Valid, expectedValid2); testCase.verifyEqual(toMATLAB(arrowArray2), matlabArray'); @@ -67,7 +67,7 @@ function InferNulls(testCase) function ValidNoNulls(testCase) % Create a MATLAB array with no null values (i.e. no NaNs). matlabArray = [1, 2, 3]'; - arrowArray = arrow.array.Float64Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = [true, true, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -75,7 +75,7 @@ function ValidNoNulls(testCase) function ValidAllNulls(testCase) % Create a MATLAB array with all null values (i.e. all NaNs). matlabArray = [NaN, NaN, NaN]'; - arrowArray = arrow.array.Float64Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = [false, false, false]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -83,18 +83,18 @@ function ValidAllNulls(testCase) function EmptyArrayValidBitmap(testCase) % Create an empty 0x0 MATLAB array. matlabArray = double.empty(0, 0); - arrowArray = arrow.array.Float64Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); expectedValid = logical.empty(0, 1); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 0x1 MATLAB array. matlabArray = double.empty(0, 1); - arrowArray = arrow.array.Float64Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); % Create an empty 1x0 MATLAB array. 
matlabArray = double.empty(1, 0); - arrowArray = arrow.array.Float64Array(matlabArray); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray); testCase.verifyEqual(arrowArray.Valid, expectedValid); end @@ -102,7 +102,7 @@ function LogicalValidNVPair(testCase) matlabArray = [1 2 3]; % Supply a logical vector for Valid - arrowArray = arrow.array.Float64Array(matlabArray, Valid=[false; true; true]); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray, Valid=[false; true; true]); testCase.verifyEqual(arrowArray.Valid, [false; true; true]); testCase.verifyEqual(toMATLAB(arrowArray), [NaN; 2; 3]); end @@ -111,7 +111,7 @@ function NumericlValidNVPair(testCase) matlabArray = [1 2 3]; % Supply a numeric vector for Valid - arrowArray = arrow.array.Float64Array(matlabArray, Valid=[1 3]); + arrowArray = testCase.ArrowArrayConstructorFcn(matlabArray, Valid=[1 3]); testCase.verifyEqual(arrowArray.Valid, [true; false; true]); testCase.verifyEqual(toMATLAB(arrowArray), [1; NaN; 3]); end diff --git a/matlab/test/arrow/array/tInt16Array.m b/matlab/test/arrow/array/tInt16Array.m index 9cb5fdc1d1049..391c8ccb6243b 100644 --- a/matlab/test/arrow/array/tInt16Array.m +++ b/matlab/test/arrow/array/tInt16Array.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName = "arrow.array.Int16Array" - ArrowArrayConstructor = @arrow.array.Int16Array + ArrowArrayConstructorFcn = @arrow.array.Int16Array.fromMATLAB MatlabConversionFcn = @int16 % int16 method on class MatlabArrayFcn = @int16 % int16 function MaxValue = intmax("int16") diff --git a/matlab/test/arrow/array/tInt32Array.m b/matlab/test/arrow/array/tInt32Array.m index b45705592d714..accf70735915f 100644 --- a/matlab/test/arrow/array/tInt32Array.m +++ b/matlab/test/arrow/array/tInt32Array.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName = "arrow.array.Int32Array" - ArrowArrayConstructor = @arrow.array.Int32Array + ArrowArrayConstructorFcn = @arrow.array.Int32Array.fromMATLAB MatlabConversionFcn = @int32 % int32 method on class MatlabArrayFcn = @int32 % int32 function MaxValue = intmax("int32") diff --git a/matlab/test/arrow/array/tInt64Array.m b/matlab/test/arrow/array/tInt64Array.m index 0b38f58547cce..909e9af866e0f 100644 --- a/matlab/test/arrow/array/tInt64Array.m +++ b/matlab/test/arrow/array/tInt64Array.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName = "arrow.array.Int64Array" - ArrowArrayConstructor = @arrow.array.Int64Array + ArrowArrayConstructorFcn = @arrow.array.Int64Array.fromMATLAB MatlabConversionFcn = @int64 % int64 method on class MatlabArrayFcn = @int64 % int64 function MaxValue = intmax("int64") diff --git a/matlab/test/arrow/array/tInt8Array.m b/matlab/test/arrow/array/tInt8Array.m index 8ce8e4e9b2d14..2817b7b61f31e 100644 --- a/matlab/test/arrow/array/tInt8Array.m +++ b/matlab/test/arrow/array/tInt8Array.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName = "arrow.array.Int8Array" - ArrowArrayConstructor = @arrow.array.Int8Array + ArrowArrayConstructorFcn = @arrow.array.Int8Array.fromMATLAB MatlabConversionFcn = @int8 % int8 method on class MatlabArrayFcn = @int8 % int8 function MaxValue = intmax("int8") diff --git a/matlab/test/arrow/array/tStringArray.m b/matlab/test/arrow/array/tStringArray.m index dbb2adca0ce5b..85628ce4216dd 100644 --- a/matlab/test/arrow/array/tStringArray.m +++ b/matlab/test/arrow/array/tStringArray.m @@ -18,7 +18,7 @@ properties ArrowArrayClassName(1, 1) string = "arrow.array.StringArray" - ArrowArrayConstructor = @arrow.array.StringArray + ArrowArrayConstructorFcn = @arrow.array.StringArray.fromMATLAB 
MatlabArrayFcn = @string MatlabConversionFcn = @string NullSubstitutionValue = string(missing) @@ -36,34 +36,34 @@ function verifyOnMatlabPath(tc) methods(Test) function BasicTest(tc) - A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(["A", "B", "C"])); + A = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(["A", "B", "C"])); className = string(class(A)); tc.verifyEqual(className, tc.ArrowArrayClassName); end function ToMATLAB(tc) % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn("A")); + A1 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn("A")); data = toMATLAB(A1); tc.verifyEqual(data, tc.MatlabArrayFcn("A")); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(["A", "B", "C"])); + A2 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(["A", "B", "C"])); data = toMATLAB(A2); tc.verifyEqual(data, tc.MatlabArrayFcn(["A", "B", "C"]')); % Create a StringArray from an empty 0x0 string vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(string.empty(0, 0))); + A3 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(string.empty(0, 0))); data = toMATLAB(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a StringArray from an empty 0x1 string vector - A4= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(string.empty(0, 1))); + A4= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(string.empty(0, 1))); data = toMATLAB(A4); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a StringArray from an empty 1x0 string vector - A5= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(string.empty(0, 1))); + A5= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(string.empty(1, 0))); data = toMATLAB(A5); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end @@ -72,27 +72,27 @@ function MatlabConversion(tc) % Tests the type-specific conversion method (i.e.
string) % Create array from a scalar - A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn("A")); + A1 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn("A")); data = tc.MatlabConversionFcn(A1); tc.verifyEqual(data, tc.MatlabArrayFcn("A")); % Create array from a vector - A2 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(["A", "B", "C"])); + A2 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(["A", "B", "C"])); data = tc.MatlabConversionFcn(A2); tc.verifyEqual(data, tc.MatlabArrayFcn(["A", "B", "C"]')); % Create a StringArray from an empty 0x0 string vector - A3 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(string.empty(0, 0))); + A3 = tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(string.empty(0, 0))); data = tc.MatlabConversionFcn(A3); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a StringArray from an empty 0x1 string vector - A4= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(string.empty(0, 1))); + A4= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(string.empty(0, 1))); data = tc.MatlabConversionFcn(A4); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); % Create a StringArray from an empty 1x0 string vector - A5= tc.ArrowArrayConstructor(tc.MatlabArrayFcn(string.empty(0, 1))); + A5= tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(string.empty(1, 0))); data = tc.MatlabConversionFcn(A5); tc.verifyEqual(data, tc.MatlabArrayFcn(reshape([], 0, 1))); end @@ -101,7 +101,7 @@ function LogicalValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a logical array data = tc.MatlabArrayFcn(["A", "B", "C"]'); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[false true true]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[false true true]); expectedData = data; expectedData(1) = tc.NullSubstitutionValue; @@ -114,7 +114,7 @@ function NumericValidNVPair(tc) % Verify the expected elements are treated as null when Valid % is provided as a array of indices data = tc.MatlabArrayFcn(["A", "B", "C"]'); - arrowArray = tc.ArrowArrayConstructor(data, Valid=[1, 2]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[1, 2]); expectedData = data; expectedData(3) = tc.NullSubstitutionValue; @@ -126,7 +126,7 @@ function NumericValidNVPair(tc) % Make sure the optimization where the valid-bitmap is stored as % a nullptr works as expected.
expectedData = data; - arrowArray = tc.ArrowArrayConstructor(data, Valid=[1, 2, 3]); + arrowArray = tc.ArrowArrayConstructorFcn(data, Valid=[1, 2, 3]); tc.verifyEqual(tc.MatlabConversionFcn(arrowArray), expectedData); tc.verifyEqual(toMATLAB(arrowArray), expectedData); tc.verifyEqual(arrowArray.Valid, [true; true; true]); @@ -135,20 +135,21 @@ function NumericValidNVPair(tc) function ErrorIfNonVector(tc) data = tc.MatlabArrayFcn(["A", "B", "A", "B", "A", "B", "A", "B", "A"]); data = reshape(data, 3, 1, 3); - fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn(data)); - tc.verifyError(fcn, "MATLAB:expectedVector"); + fcn = @() tc.ArrowArrayConstructorFcn(tc.MatlabArrayFcn(data)); + tc.verifyError(fcn, "arrow:array:InvalidShape"); end - function ErrorIfEmptyArrayIsNotTwoDimensional(tc) + function AllowNDimensionalEmptyArray(tc) data = tc.MatlabArrayFcn(reshape(string.empty(0, 0), [1 0 0])); - fcn = @() tc.ArrowArrayConstructor(data); - tc.verifyError(fcn, "MATLAB:expected2D"); + arrowArray = tc.ArrowArrayConstructorFcn(data); + tc.verifyEqual(arrowArray.Length, int64(0)); + tc.verifyEqual(toMATLAB(arrowArray), string.empty(0, 1)); end function TestArrowType(tc) % Verify the array has the expected arrow.type.Type object data = tc.MatlabArrayFcn(["A", "B"]); - arrowArray = tc.ArrowArrayConstructor(data); + arrowArray = tc.ArrowArrayConstructorFcn(data); tc.verifyEqual(arrowArray.Type.ID, tc.ArrowType.ID); end @@ -160,7 +161,7 @@ function Unicode(tc) mango = "🥭"; matlabArray = tc.MatlabArrayFcn([smiley; tree; mango]); - arrowArray = tc.ArrowArrayConstructor(matlabArray); + arrowArray = tc.ArrowArrayConstructorFcn(matlabArray); matlabArrayConverted = toMATLAB(arrowArray); tc.verifyEqual(matlabArrayConverted, matlabArray); end @@ -169,7 +170,7 @@ function Missing(tc) % Verify that string(missing) values get mapped to the empty % string value when InferNulls=false. 
matlabArray = tc.MatlabArrayFcn(["A"; string(missing); string(missing)]); - arrowArray = tc.ArrowArrayConstructor(matlabArray, InferNulls=false); + arrowArray = tc.ArrowArrayConstructorFcn(matlabArray, InferNulls=false); matlabArrayConverted = toMATLAB(arrowArray); tc.verifyEqual(matlabArrayConverted, ["A"; ""; ""]); end @@ -180,25 +181,25 @@ function CellStr(tc) % Row vector matlabArray = {'A', 'B', 'C'}; - arrowArray = tc.ArrowArrayConstructor(matlabArray); + arrowArray = tc.ArrowArrayConstructorFcn(matlabArray); matlabArrayConverted = toMATLAB(arrowArray); tc.verifyEqual(matlabArrayConverted, string(matlabArray')); % Column vector matlabArray = {'A'; 'B'; 'C'}; - arrowArray = tc.ArrowArrayConstructor(matlabArray); + arrowArray = tc.ArrowArrayConstructorFcn(matlabArray); matlabArrayConverted = toMATLAB(arrowArray); tc.verifyEqual(matlabArrayConverted, string(matlabArray)); % One element cellstr matlabArray = {''}; - arrowArray = tc.ArrowArrayConstructor(matlabArray); + arrowArray = tc.ArrowArrayConstructorFcn(matlabArray); matlabArrayConverted = toMATLAB(arrowArray); tc.verifyEqual(matlabArrayConverted, string(matlabArray)); % Empty cell matlabArray = {}; - arrowArray = tc.ArrowArrayConstructor(matlabArray); + arrowArray = tc.ArrowArrayConstructorFcn(matlabArray); matlabArrayConverted = toMATLAB(arrowArray); tc.verifyEqual(matlabArrayConverted, string.empty(0, 1)); end @@ -209,23 +210,23 @@ function ErrorIfChar(tc) % Row vector matlabArray = 'abc'; - tc.verifyError(@() tc.ArrowArrayConstructor(matlabArray), "MATLAB:invalidType"); + tc.verifyError(@() tc.ArrowArrayConstructorFcn(matlabArray), "arrow:array:InvalidType"); % Column vector matlabArray = ['a';'b';'c']; - tc.verifyError(@() tc.ArrowArrayConstructor(matlabArray), "MATLAB:invalidType"); + tc.verifyError(@() tc.ArrowArrayConstructorFcn(matlabArray), "arrow:array:InvalidType"); % Empty char (0x0) matlabArray = ''; - tc.verifyError(@() tc.ArrowArrayConstructor(matlabArray), "MATLAB:invalidType"); + tc.verifyError(@() tc.ArrowArrayConstructorFcn(matlabArray), "arrow:array:InvalidType"); % Empty char (0x1) matlabArray = char.empty(0, 1); - tc.verifyError(@() tc.ArrowArrayConstructor(matlabArray), "MATLAB:invalidType"); + tc.verifyError(@() tc.ArrowArrayConstructorFcn(matlabArray), "arrow:array:InvalidType"); % Empty char (1x0) matlabArray = char.empty(1, 0); - tc.verifyError(@() tc.ArrowArrayConstructor(matlabArray), "MATLAB:invalidType"); + tc.verifyError(@() tc.ArrowArrayConstructorFcn(matlabArray), "arrow:array:InvalidType"); end end end diff --git a/matlab/test/arrow/array/tTimestampArray.m b/matlab/test/arrow/array/tTimestampArray.m index b0a902f319bf1..5b7182c386c25 100644 --- a/matlab/test/arrow/array/tTimestampArray.m +++ b/matlab/test/arrow/array/tTimestampArray.m @@ -16,6 +16,10 @@ classdef tTimestampArray < matlab.unittest.TestCase % Tests for arrow.array.TimestampArray + properties + ArrowArrayConstructorFcn = @arrow.array.TimestampArray.fromMATLAB + end + properties(TestParameter) TimeZone = {"" "America/New_York"} TimeUnit = {arrow.type.TimeUnit.Second arrow.type.TimeUnit.Millisecond @@ -25,26 +29,25 @@ methods(Test) function Basic(tc, TimeZone) dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); - arrowArray = arrow.array.TimestampArray(dates); + arrowArray = tc.ArrowArrayConstructorFcn(dates); className = string(class(arrowArray)); tc.verifyEqual(className, "arrow.array.TimestampArray"); end function TestLength(testCase, TimeZone) % Verify the Length property. 
- import arrow.array.TimestampArray dates = datetime.empty(0, 1); dates.TimeZone = TimeZone; - arrowArray = TimestampArray(dates); + arrowArray = testCase.ArrowArrayConstructorFcn(dates); testCase.verifyEqual(arrowArray.Length, int64(0)); dates = datetime(2023, 6, 22, TimeZone=TimeZone); - arrowArray = TimestampArray(dates); + arrowArray = testCase.ArrowArrayConstructorFcn(dates); testCase.verifyEqual(arrowArray.Length, int64(1)); dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); - arrowArray = TimestampArray(dates); + arrowArray = testCase.ArrowArrayConstructorFcn(dates); testCase.verifyEqual(arrowArray.Length, int64(5)); end @@ -54,76 +57,72 @@ function TestDefaultTimestampType(testCase, TimeZone) import arrow.array.TimestampArray dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); - arrowArray = TimestampArray(dates); + arrowArray = testCase.ArrowArrayConstructorFcn(dates); testCase.verifyTimestampType(arrowArray.Type, arrow.type.TimeUnit.Microsecond, TimeZone); end function TestSupplyTimeUnit(testCase, TimeZone) % Supply the TimeUnit name-value pair at construction. - import arrow.array.TimestampArray dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); - arrowArray = TimestampArray(dates, TimeUnit="Second"); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit="Second"); testCase.verifyTimestampType(arrowArray.Type, arrow.type.TimeUnit.Second, TimeZone); - arrowArray = TimestampArray(dates, TimeUnit="Millisecond"); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit="Millisecond"); testCase.verifyTimestampType(arrowArray.Type, arrow.type.TimeUnit.Millisecond, TimeZone); - arrowArray = TimestampArray(dates, TimeUnit="Microsecond"); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit="Microsecond"); testCase.verifyTimestampType(arrowArray.Type, arrow.type.TimeUnit.Microsecond, TimeZone); - arrowArray = TimestampArray(dates, TimeUnit="Nanosecond"); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit="Nanosecond"); testCase.verifyTimestampType(arrowArray.Type, arrow.type.TimeUnit.Nanosecond, TimeZone); end function TestToMATLAB(testCase, TimeUnit, TimeZone) % Verify toMATLAB() round-trips the original datetime array. - import arrow.array.TimestampArray dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); - arrowArray = arrow.array.TimestampArray(dates, TimeUnit=TimeUnit); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit=TimeUnit); values = toMATLAB(arrowArray); testCase.verifyEqual(values, dates'); end function TestDatetime(testCase, TimeUnit, TimeZone) % Verify datetime() round-trips the original datetime array. - import arrow.array.TimestampArray dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); - arrowArray = arrow.array.TimestampArray(dates, TimeUnit=TimeUnit); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit=TimeUnit); values = datetime(arrowArray); testCase.verifyEqual(values, dates'); end function TestValid(testCase, TimeZone) % Verify the Valid property returns the expected logical vector. 
- import arrow.array.TimestampArray + dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); dates([2 4]) = NaT; - arrowArray = arrow.array.TimestampArray(dates); + arrowArray = testCase.ArrowArrayConstructorFcn(dates); testCase.verifyEqual(arrowArray.Valid, [true; false; true; false; true]); testCase.verifyEqual(toMATLAB(arrowArray), dates'); testCase.verifyEqual(datetime(arrowArray), dates'); end function TestInferNulls(testCase, TimeUnit, TimeZone) - import arrow.array.TimestampArray dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); dates([2 4]) = NaT; % Verify NaT is treated as a null value if InferNulls=true. expectedDates = dates'; - arrowArray = arrow.array.TimestampArray(dates, TimeUnit=TimeUnit, InferNulls=true); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit=TimeUnit, InferNulls=true); testCase.verifyEqual(arrowArray.Valid, [true; false; true; false; true]); testCase.verifyEqual(toMATLAB(arrowArray), expectedDates); % Verify NaT is not treated as a null value if InferNulls=false. % The NaT values are mapped to int64(0). - arrowArray = arrow.array.TimestampArray(dates, TimeUnit=TimeUnit, InferNulls=false); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit=TimeUnit, InferNulls=false); testCase.verifyEqual(arrowArray.Valid, [true; true; true; true; true]); % If the TimestampArray is zoned, int64(0) may not correspond @@ -134,13 +133,12 @@ function TestInferNulls(testCase, TimeUnit, TimeZone) end function TestValidNVPair(testCase, TimeUnit, TimeZone) - import arrow.array.TimestampArray dates = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); dates([2 4]) = NaT; % Supply the Valid name-value pair as vector of indices. - arrowArray = arrow.array.TimestampArray(dates, TimeUnit=TimeUnit, Valid=[1 2 5]); + arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit=TimeUnit, Valid=[1 2 5]); testCase.verifyEqual(arrowArray.Valid, [true; true; false; false; true]); expectedDates = dates'; expectedDates(2) = getFillValue(TimeZone); @@ -148,33 +146,41 @@ function TestValidNVPair(testCase, TimeUnit, TimeZone) testCase.verifyEqual(toMATLAB(arrowArray), expectedDates); % Supply the Valid name-value pair as a logical scalar. 
-            arrowArray = arrow.array.TimestampArray(dates, TimeUnit=TimeUnit, Valid=false);
+            arrowArray = testCase.ArrowArrayConstructorFcn(dates, TimeUnit=TimeUnit, Valid=false);
             testCase.verifyEqual(arrowArray.Valid, [false; false; false; false; false]);
             expectedDates(:) = NaT;
             testCase.verifyEqual(toMATLAB(arrowArray), expectedDates);
         end

         function ErrorIfNonVector(testCase)
-            import arrow.array.TimestampArray
             dates = datetime(2023, 6, 2) + days(0:11);
             dates = reshape(dates, 2, 6);
-            fcn = @() TimestampArray(dates);
-            testCase.verifyError(fcn, "MATLAB:expectedVector");
+            fcn = @() testCase.ArrowArrayConstructorFcn(dates);
+            testCase.verifyError(fcn, "arrow:array:InvalidShape");

             dates = reshape(dates, 3, 2, 2);
-            fcn = @() TimestampArray(dates);
-            testCase.verifyError(fcn, "MATLAB:expectedVector");
+            fcn = @() testCase.ArrowArrayConstructorFcn(dates);
+            testCase.verifyError(fcn, "arrow:array:InvalidShape");
         end

         function EmptyDatetimeVector(testCase)
             import arrow.array.TimestampArray

             dates = datetime.empty(0, 0);
-            arrowArray = TimestampArray(dates);
+            arrowArray = testCase.ArrowArrayConstructorFcn(dates);
+            testCase.verifyEqual(arrowArray.Length, int64(0));
+            testCase.verifyEqual(arrowArray.Valid, logical.empty(0, 1));
+            testCase.verifyEqual(toMATLAB(arrowArray), datetime.empty(0, 1));
+
+            % Test with an N-dimensional empty array
+            dates = datetime.empty(0, 1, 0);
+            arrowArray = testCase.ArrowArrayConstructorFcn(dates);
             testCase.verifyEqual(arrowArray.Length, int64(0));
             testCase.verifyEqual(arrowArray.Valid, logical.empty(0, 1));
             testCase.verifyEqual(toMATLAB(arrowArray), datetime.empty(0, 1));
+
+
         end
 end

diff --git a/matlab/test/arrow/array/tUInt16Array.m b/matlab/test/arrow/array/tUInt16Array.m
index 705d6eabc0b7b..c3445b7c7c6ea 100644
--- a/matlab/test/arrow/array/tUInt16Array.m
+++ b/matlab/test/arrow/array/tUInt16Array.m
@@ -18,7 +18,7 @@
     properties
         ArrowArrayClassName = "arrow.array.UInt16Array"
-        ArrowArrayConstructor = @arrow.array.UInt16Array
+        ArrowArrayConstructorFcn = @arrow.array.UInt16Array.fromMATLAB
         MatlabConversionFcn = @uint16 % uint16 method on class
         MatlabArrayFcn = @uint16 % uint16 function
         MaxValue = intmax("uint16")
diff --git a/matlab/test/arrow/array/tUInt32Array.m b/matlab/test/arrow/array/tUInt32Array.m
index 267a687738e44..e6372b8db79f4 100644
--- a/matlab/test/arrow/array/tUInt32Array.m
+++ b/matlab/test/arrow/array/tUInt32Array.m
@@ -18,7 +18,7 @@
     properties
         ArrowArrayClassName = "arrow.array.UInt32Array"
-        ArrowArrayConstructor = @arrow.array.UInt32Array
+        ArrowArrayConstructorFcn = @arrow.array.UInt32Array.fromMATLAB
         MatlabConversionFcn = @uint32 % uint32 method on class
         MatlabArrayFcn = @uint32 % uint32 function
         MaxValue = intmax("uint32")
diff --git a/matlab/test/arrow/array/tUInt64Array.m b/matlab/test/arrow/array/tUInt64Array.m
index b1a23a004de69..16c3cc81cdef0 100644
--- a/matlab/test/arrow/array/tUInt64Array.m
+++ b/matlab/test/arrow/array/tUInt64Array.m
@@ -18,7 +18,7 @@
     properties
         ArrowArrayClassName = "arrow.array.UInt64Array"
-        ArrowArrayConstructor = @arrow.array.UInt64Array
+        ArrowArrayConstructorFcn = @arrow.array.UInt64Array.fromMATLAB
         MatlabConversionFcn = @uint64 % uint64 method on class
         MatlabArrayFcn = @uint64 % uint64 function
         MaxValue = intmax("uint64")
diff --git a/matlab/test/arrow/array/tUInt8Array.m b/matlab/test/arrow/array/tUInt8Array.m
index 3db79f8c0b16d..a27e3442eda89 100644
--- a/matlab/test/arrow/array/tUInt8Array.m
+++ b/matlab/test/arrow/array/tUInt8Array.m
@@ -18,7 +18,7 @@
     properties
         ArrowArrayClassName = "arrow.array.UInt8Array"
-
ArrowArrayConstructor = @arrow.array.UInt8Array + ArrowArrayConstructorFcn = @arrow.array.UInt8Array.fromMATLAB MatlabConversionFcn = @uint8 % uint8 method on class MatlabArrayFcn = @uint8 % uint8 function MaxValue = intmax("uint8") diff --git a/matlab/test/arrow/internal/validate/tNonsparse.m b/matlab/test/arrow/internal/validate/tNonsparse.m new file mode 100644 index 0000000000000..e2eb363fca316 --- /dev/null +++ b/matlab/test/arrow/internal/validate/tNonsparse.m @@ -0,0 +1,44 @@ +%TNONSPARSE Unit tests for arrow.internal.validate.nonsparse. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tNonsparse < matlab.unittest.TestCase + + methods(Test) + % Test methods + function ErrorIfSparseDouble(testCase) + fcn = @() arrow.internal.validate.nonsparse(sparse(ones([10 1]))); + errid = "arrow:array:Sparse"; + testCase.verifyError(fcn, errid); + end + + function ErrorIfSparseLogical(testCase) + fcn = @() arrow.internal.validate.nonsparse(sparse(true([10 1]))); + errid = "arrow:array:Sparse"; + testCase.verifyError(fcn, errid); + end + + function NoErrorIfNonSparseDouble(testCase) + fcn = @() arrow.internal.validate.nonsparse(ones([10 1])); + testCase.verifyWarningFree(fcn); + end + + function NoErrorIfNonSparseLogical(testCase) + fcn = @() arrow.internal.validate.nonsparse(true([10 1])); + testCase.verifyWarningFree(fcn); + end + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/internal/validate/tNumeric.m b/matlab/test/arrow/internal/validate/tNumeric.m new file mode 100644 index 0000000000000..91c94233ded0b --- /dev/null +++ b/matlab/test/arrow/internal/validate/tNumeric.m @@ -0,0 +1,56 @@ +%TNUMERIC Unit tests for arrow.internal.validate.numeric. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
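% The tests below pin down the contract of arrow.internal.validate.numeric.
% A plausible sketch of how it relates to the sibling validators tested in
% this patch (a hypothetical composition; the real implementation may
% differ):
%
%   function numeric(data, classname)
%       arrow.internal.validate.type(data, classname);  % arrow:array:InvalidType
%       arrow.internal.validate.shape(data);            % arrow:array:InvalidShape
%       arrow.internal.validate.nonsparse(data);        % arrow:array:Sparse
%       arrow.internal.validate.realnumeric(data);      % arrow:array:ComplexNumeric
%   end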
+ +classdef tNumeric < matlab.unittest.TestCase + + methods(Test) + + function ErrorIfWrongType(testCase) + data = [1 2 2 4]; + fcn = @() arrow.internal.validate.numeric(data, "single"); + errid = "arrow:array:InvalidType"; + testCase.verifyError(fcn, errid); + end + + function ErrorIfNonVector(testCase) + data = [1 2; 2 4]; + fcn = @() arrow.internal.validate.numeric(data, "double"); + errid = "arrow:array:InvalidShape"; + testCase.verifyError(fcn, errid); + end + + function ErrorIfNonsparse(testCase) + data = sparse([1 1 2 2]); + fcn = @() arrow.internal.validate.numeric(data, "double"); + errid = "arrow:array:Sparse"; + testCase.verifyError(fcn, errid); + end + + function ErrorIfComplex(testCase) + data = [10 + 3i 11 + 4i]; + fcn = @() arrow.internal.validate.numeric(data, "double"); + errid = "arrow:array:ComplexNumeric"; + testCase.verifyError(fcn, errid); + end + + function NoErrorIfRealNonsparseNumericVector(testCase) + data = [10 11 12 13]; + fcn = @() arrow.internal.validate.numeric(data, "double"); + testCase.verifyWarningFree(fcn); + end + end +end \ No newline at end of file diff --git a/matlab/test/arrow/args/tParseValidElements.m b/matlab/test/arrow/internal/validate/tParseValidElements.m similarity index 99% rename from matlab/test/arrow/args/tParseValidElements.m rename to matlab/test/arrow/internal/validate/tParseValidElements.m index 9d7586db9536a..9751327adc6f7 100644 --- a/matlab/test/arrow/args/tParseValidElements.m +++ b/matlab/test/arrow/internal/validate/tParseValidElements.m @@ -186,5 +186,5 @@ function AllElementsAreValid(testCase) opts.InferNulls = true; opts.Valid end - validElements = arrow.args.parseValidElements(data, opts); + validElements = arrow.internal.validate.parseValidElements(data, opts); end \ No newline at end of file diff --git a/matlab/test/arrow/internal/validate/tRealNumeric.m b/matlab/test/arrow/internal/validate/tRealNumeric.m new file mode 100644 index 0000000000000..9b45a6e0b561c --- /dev/null +++ b/matlab/test/arrow/internal/validate/tRealNumeric.m @@ -0,0 +1,43 @@ +%TREALNUMERIC Unit tests for arrow.internal.validate.realnumeric. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tRealNumeric < matlab.unittest.TestCase + + properties(TestParameter) + NumericType = struct(uint8=@uint8, ... + uint16=@uint16, ... + uint32=@uint32, ... + uint64=@uint64, ... + int8=@int8,... + int16=@int16, ... + int32=@int32, ... + int64=@int64, ... + single=@single, ... 
+ double=@double); + end + + + methods(Test) + function ErrorIfComplex(testCase, NumericType) + complexValue = NumericType(10 + 1i); + fcn = @() arrow.internal.validate.realnumeric(complexValue); + errID = "arrow:array:ComplexNumeric"; + testCase.verifyError(fcn, errID); + end + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/internal/validate/tShape.m b/matlab/test/arrow/internal/validate/tShape.m new file mode 100644 index 0000000000000..52720a5e2508e --- /dev/null +++ b/matlab/test/arrow/internal/validate/tShape.m @@ -0,0 +1,69 @@ +%TSHAPE Unit tests for arrow.internal.validate.shape. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tShape < matlab.unittest.TestCase + + methods(Test) + function ErrorIf2DimensionalMatrix(testCase) + data = [1 2; 4 5]; + fcn = @() arrow.internal.validate.shape(data); + errID = "arrow:array:InvalidShape"; + testCase.verifyError(fcn, errID); + end + + function ErrorIfNDMatrix(testCase) + data = ones([2 2 3]); + fcn = @() arrow.internal.validate.shape(data); + errID = "arrow:array:InvalidShape"; + testCase.verifyError(fcn, errID); + end + + function NoErrorIfRowVector(testCase) + data = [1 2 4 5]; + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + end + + function NoErrorIfColumnVector(testCase) + data = [1 2 4 5]'; + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + end + + function NoErrorIfEmpty(testCase) + data = double.empty(0, 0); + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + + data = double.empty(0, 1); + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + + data = double.empty(1, 0); + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + + data = double.empty(0, 1, 0); + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + + data = double.empty(0, 0, 5); + fcn = @() arrow.internal.validate.shape(data); + testCase.verifyWarningFree(fcn); + end + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+args/validateTypeAndShape.m b/matlab/test/arrow/internal/validate/tType.m similarity index 55% rename from matlab/src/matlab/+arrow/+args/validateTypeAndShape.m rename to matlab/test/arrow/internal/validate/tType.m index 78e8dd1efe4dd..38ec58c3afffb 100644 --- a/matlab/src/matlab/+arrow/+args/validateTypeAndShape.m +++ b/matlab/test/arrow/internal/validate/tType.m @@ -1,3 +1,5 @@ +%TTYPE Unit tests for arrow.internal.validate.type. + % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with % this work for additional information regarding copyright ownership. @@ -13,24 +15,20 @@ % implied. 
See the License for the specific language governing
% permissions and limitations under the License.

-function validateTypeAndShape(data, type)
-% Validates data has the expected type and is a vector or empty 2D
-% matrix. If data is numeric, validates is real and nonsparse.
-
-    arguments
-        data
-        type(1, 1) string
-    end
-
-    % If data is empty, only require it's shape to be 2D to support 0x0
-    % arrays. Otherwise, require data to be a vector.
-    %
-    % TODO: Consider supporting nonvector 2D arrays. We chould reshape them
-    % to column vectors if needed.
+classdef tType < matlab.unittest.TestCase

-    expectedShape = "vector";
-    if isempty(data)
-        expectedShape = "2d";
+    methods(Test)
+        function ErrorIfWrongType(testCase)
+            data = uint64([1 2 3]);
+            fcn = @() arrow.internal.validate.type(data, "double");
+            errid = "arrow:array:InvalidType";
+            testCase.verifyError(fcn, errid);
+        end
+
+        function NoErrorIfRightType(testCase)
+            data = uint64([1 2 3]);
+            fcn = @() arrow.internal.validate.type(data, "uint64");
+            testCase.verifyWarningFree(fcn);
+        end
     end
-    validateattributes(data, type, [expectedShape, "nonsparse", "real"]);
 end
\ No newline at end of file

From 4b70fd4b1eac29cd8ac20083ad644b60fe6483ab Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 4 Aug 2023 05:50:50 +0900
Subject: [PATCH 089/749] GH-36856: [C++] Remove needless braces from
 BasicDecimal256FromLE() arguments (#36987)

### Rationale for this change

BasicDecimal256FromLE() requires 4 arguments, so we should not use the
BasicDecimal256FromLE({X, X, X, X}) form. I don't know why the current form
works on amd64 but doesn't work on s390x.

### What changes are included in this PR?

Removed the needless "{" and "}".

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* Closes: #36856

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 cpp/src/arrow/util/decimal_internal.h | 447 +++++++++++++-------------
 1 file changed, 220 insertions(+), 227 deletions(-)

diff --git a/cpp/src/arrow/util/decimal_internal.h b/cpp/src/arrow/util/decimal_internal.h
index 51a7229ab6678..89f755af88316 100644
--- a/cpp/src/arrow/util/decimal_internal.h
+++ b/cpp/src/arrow/util/decimal_internal.h
@@ -197,235 +197,228 @@ constexpr BasicDecimal128 kDecimal128HalfPowersOfTen[] = {
 #endif

 constexpr BasicDecimal256 kDecimal256PowersOfTen[76 + 1] = {
-    BasicDecimal256FromLE({1ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({100ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({1000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({100000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({1000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({100000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({1000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({100000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({1000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({100000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({1000000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10000000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({100000000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({1000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
-    BasicDecimal256FromLE({10000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
-
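// A possible explanation for the amd64-vs-s390x difference noted in the
// commit message above -- a sketch using a hypothetical macro FROM_LE and
// constructor MakeDecimal256, not the real definitions in this header:
// BasicDecimal256FromLE is a function-like macro (see the #undef at the end
// of this hunk), and the preprocessor only shields macro-argument commas
// inside parentheses; braces do not group. So
//
//   FROM_LE({1ULL, 0ULL, 0ULL, 0ULL})
//
// is split into the four arguments "{1ULL", "0ULL", "0ULL", "0ULL}". A
// definition that keeps the argument order can reassemble a single braced
// list by accident and still compile:
//
//   #define FROM_LE(w0, w1, w2, w3) MakeDecimal256({w0, w1, w2, w3})
//     // expands to MakeDecimal256({{1ULL, 0ULL, 0ULL, 0ULL}})
//
// while a big-endian definition that reorders the words produces four
// separate arguments and no longer matches a single-list constructor:
//
//   #define FROM_LE(w0, w1, w2, w3) MakeDecimal256({w3, w2, w1, w0})
//     // expands to MakeDecimal256({0ULL}, 0ULL, 0ULL, {1ULL})
//
// Passing the four words without braces, as this commit does, makes the
// expansion unambiguous on both layouts.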
BasicDecimal256FromLE({7766279631452241920ULL, 5ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({3875820019684212736ULL, 54ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({1864712049423024128ULL, 542ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({200376420520689664ULL, 5421ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({2003764205206896640ULL, 54210ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({1590897978359414784ULL, 542101ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({11515845246265065472ULL, 54210108ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL}), - BasicDecimal256FromLE({13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL}), - BasicDecimal256FromLE( - {4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL}), - BasicDecimal256FromLE( - {11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, 0ULL}), - BasicDecimal256FromLE( - {7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL}), - BasicDecimal256FromLE( - {80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL}), - BasicDecimal256FromLE( - {802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, 0ULL}), - BasicDecimal256FromLE( - {8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, 0ULL}), - BasicDecimal256FromLE( - {6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, 0ULL}), - BasicDecimal256FromLE( - {9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, 0ULL}), - BasicDecimal256FromLE( - {17909126868192198656ULL, 10278298888501128114ULL, 29387358770ULL, 0ULL}), - BasicDecimal256FromLE( - {13070572018536022016ULL, 10549268516463523069ULL, 293873587705ULL, 0ULL}), - BasicDecimal256FromLE( - {1578511669393358848ULL, 13258964796087472617ULL, 2938735877055ULL, 0ULL}), - BasicDecimal256FromLE( - {15785116693933588480ULL, 3462439444907864858ULL, 29387358770557ULL, 0ULL}), - BasicDecimal256FromLE( - {10277214349659471872ULL, 16177650375369096972ULL, 293873587705571ULL, 0ULL}), - BasicDecimal256FromLE( - {10538423128046960640ULL, 14202551164014556797ULL, 2938735877055718ULL, 0ULL}), - BasicDecimal256FromLE( - {13150510911921848320ULL, 12898303124178706663ULL, 29387358770557187ULL, 0ULL}), - BasicDecimal256FromLE( - {2377900603251621888ULL, 18302566799529756941ULL, 293873587705571876ULL, 0ULL}), - BasicDecimal256FromLE( - {5332261958806667264ULL, 17004971331911604867ULL, 2938735877055718769ULL, 0ULL}), - BasicDecimal256FromLE( - {16429131440647569408ULL, 4029016655730084128ULL, 10940614696847636083ULL, 1ULL}), - BasicDecimal256FromLE({16717361816799281152ULL, 
3396678409881738056ULL, - 17172426599928602752ULL, 15ULL}), - BasicDecimal256FromLE({1152921504606846976ULL, 15520040025107828953ULL, - 5703569335900062977ULL, 159ULL}), - BasicDecimal256FromLE({11529215046068469760ULL, 7626447661401876602ULL, - 1695461137871974930ULL, 1593ULL}), - BasicDecimal256FromLE({4611686018427387904ULL, 2477500319180559562ULL, - 16954611378719749304ULL, 15930ULL}), - BasicDecimal256FromLE({9223372036854775808ULL, 6328259118096044006ULL, - 3525417123811528497ULL, 159309ULL}), - BasicDecimal256FromLE( - {0ULL, 7942358959831785217ULL, 16807427164405733357ULL, 1593091ULL}), - BasicDecimal256FromLE( - {0ULL, 5636613303479645706ULL, 2053574980671369030ULL, 15930919ULL}), - BasicDecimal256FromLE( - {0ULL, 1025900813667802212ULL, 2089005733004138687ULL, 159309191ULL}), - BasicDecimal256FromLE( - {0ULL, 10259008136678022120ULL, 2443313256331835254ULL, 1593091911ULL}), - BasicDecimal256FromLE( - {0ULL, 10356360998232463120ULL, 5986388489608800929ULL, 15930919111ULL}), - BasicDecimal256FromLE( - {0ULL, 11329889613776873120ULL, 4523652674959354447ULL, 159309191113ULL}), - BasicDecimal256FromLE( - {0ULL, 2618431695511421504ULL, 8343038602174441244ULL, 1593091911132ULL}), - BasicDecimal256FromLE( - {0ULL, 7737572881404663424ULL, 9643409726906205977ULL, 15930919111324ULL}), - BasicDecimal256FromLE( - {0ULL, 3588752519208427776ULL, 4200376900514301694ULL, 159309191113245ULL}), - BasicDecimal256FromLE( - {0ULL, 17440781118374726144ULL, 5110280857723913709ULL, 1593091911132452ULL}), - BasicDecimal256FromLE( - {0ULL, 8387114520361296896ULL, 14209320429820033867ULL, 15930919111324522ULL}), - BasicDecimal256FromLE( - {0ULL, 10084168908774762496ULL, 12965995782233477362ULL, 159309191113245227ULL}), - BasicDecimal256FromLE( - {0ULL, 8607968719199866880ULL, 532749306367912313ULL, 1593091911132452277ULL})}; + BasicDecimal256FromLE(1ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(100ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(100000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(100000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(100000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(100000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(100000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1000000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10000000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(7766279631452241920ULL, 5ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(3875820019684212736ULL, 54ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1864712049423024128ULL, 542ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(200376420520689664ULL, 5421ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(2003764205206896640ULL, 54210ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1590897978359414784ULL, 542101ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(11515845246265065472ULL, 
54210108ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL), + BasicDecimal256FromLE(13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL), + BasicDecimal256FromLE(4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL), + BasicDecimal256FromLE(11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, + 0ULL), + BasicDecimal256FromLE(7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL), + BasicDecimal256FromLE(80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL), + BasicDecimal256FromLE(802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, + 0ULL), + BasicDecimal256FromLE(8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, + 0ULL), + BasicDecimal256FromLE(6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, + 0ULL), + BasicDecimal256FromLE(9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, + 0ULL), + BasicDecimal256FromLE(17909126868192198656ULL, 10278298888501128114ULL, + 29387358770ULL, 0ULL), + BasicDecimal256FromLE(13070572018536022016ULL, 10549268516463523069ULL, + 293873587705ULL, 0ULL), + BasicDecimal256FromLE(1578511669393358848ULL, 13258964796087472617ULL, + 2938735877055ULL, 0ULL), + BasicDecimal256FromLE(15785116693933588480ULL, 3462439444907864858ULL, + 29387358770557ULL, 0ULL), + BasicDecimal256FromLE(10277214349659471872ULL, 16177650375369096972ULL, + 293873587705571ULL, 0ULL), + BasicDecimal256FromLE(10538423128046960640ULL, 14202551164014556797ULL, + 2938735877055718ULL, 0ULL), + BasicDecimal256FromLE(13150510911921848320ULL, 12898303124178706663ULL, + 29387358770557187ULL, 0ULL), + BasicDecimal256FromLE(2377900603251621888ULL, 18302566799529756941ULL, + 293873587705571876ULL, 0ULL), + BasicDecimal256FromLE(5332261958806667264ULL, 17004971331911604867ULL, + 2938735877055718769ULL, 0ULL), + BasicDecimal256FromLE(16429131440647569408ULL, 4029016655730084128ULL, + 10940614696847636083ULL, 1ULL), + BasicDecimal256FromLE(16717361816799281152ULL, 3396678409881738056ULL, + 17172426599928602752ULL, 15ULL), + BasicDecimal256FromLE(1152921504606846976ULL, 15520040025107828953ULL, + 5703569335900062977ULL, 159ULL), + BasicDecimal256FromLE(11529215046068469760ULL, 7626447661401876602ULL, + 1695461137871974930ULL, 1593ULL), + BasicDecimal256FromLE(4611686018427387904ULL, 2477500319180559562ULL, + 16954611378719749304ULL, 15930ULL), + BasicDecimal256FromLE(9223372036854775808ULL, 6328259118096044006ULL, + 3525417123811528497ULL, 159309ULL), + BasicDecimal256FromLE(0ULL, 7942358959831785217ULL, 16807427164405733357ULL, + 1593091ULL), + BasicDecimal256FromLE(0ULL, 5636613303479645706ULL, 
2053574980671369030ULL, + 15930919ULL), + BasicDecimal256FromLE(0ULL, 1025900813667802212ULL, 2089005733004138687ULL, + 159309191ULL), + BasicDecimal256FromLE(0ULL, 10259008136678022120ULL, 2443313256331835254ULL, + 1593091911ULL), + BasicDecimal256FromLE(0ULL, 10356360998232463120ULL, 5986388489608800929ULL, + 15930919111ULL), + BasicDecimal256FromLE(0ULL, 11329889613776873120ULL, 4523652674959354447ULL, + 159309191113ULL), + BasicDecimal256FromLE(0ULL, 2618431695511421504ULL, 8343038602174441244ULL, + 1593091911132ULL), + BasicDecimal256FromLE(0ULL, 7737572881404663424ULL, 9643409726906205977ULL, + 15930919111324ULL), + BasicDecimal256FromLE(0ULL, 3588752519208427776ULL, 4200376900514301694ULL, + 159309191113245ULL), + BasicDecimal256FromLE(0ULL, 17440781118374726144ULL, 5110280857723913709ULL, + 1593091911132452ULL), + BasicDecimal256FromLE(0ULL, 8387114520361296896ULL, 14209320429820033867ULL, + 15930919111324522ULL), + BasicDecimal256FromLE(0ULL, 10084168908774762496ULL, 12965995782233477362ULL, + 159309191113245227ULL), + BasicDecimal256FromLE(0ULL, 8607968719199866880ULL, 532749306367912313ULL, + 1593091911132452277ULL)}; constexpr BasicDecimal256 kDecimal256HalfPowersOfTen[] = { - BasicDecimal256FromLE({0ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({50ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({500ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({50000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({500000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({50000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({500000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({50000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({500000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({50000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({500000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5000000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({50000000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({500000000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5000000000000000000ULL, 0ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({13106511852580896768ULL, 2ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({1937910009842106368ULL, 27ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({932356024711512064ULL, 271ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({9323560247115120640ULL, 2710ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({1001882102603448320ULL, 27105ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({10018821026034483200ULL, 271050ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL}), - 
BasicDecimal256FromLE({1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL}), - BasicDecimal256FromLE({3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL}), - BasicDecimal256FromLE( - {15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL}), - BasicDecimal256FromLE( - {11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 0ULL}), - BasicDecimal256FromLE( - {5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL}), - BasicDecimal256FromLE( - {3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, 0ULL}), - BasicDecimal256FromLE( - {40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL}), - BasicDecimal256FromLE( - {401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, 0ULL}), - BasicDecimal256FromLE( - {4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, 0ULL}), - BasicDecimal256FromLE( - {3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, 0ULL}), - BasicDecimal256FromLE( - {13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, 0ULL}), - BasicDecimal256FromLE( - {8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, 0ULL}), - BasicDecimal256FromLE( - {15758658046122786816ULL, 14498006295086537342ULL, 146936793852ULL, 0ULL}), - BasicDecimal256FromLE( - {10012627871551455232ULL, 15852854434898512116ULL, 1469367938527ULL, 0ULL}), - BasicDecimal256FromLE( - {7892558346966794240ULL, 10954591759308708237ULL, 14693679385278ULL, 0ULL}), - BasicDecimal256FromLE( - {5138607174829735936ULL, 17312197224539324294ULL, 146936793852785ULL, 0ULL}), - BasicDecimal256FromLE( - {14492583600878256128ULL, 7101275582007278398ULL, 1469367938527859ULL, 0ULL}), - BasicDecimal256FromLE( - {15798627492815699968ULL, 15672523598944129139ULL, 14693679385278593ULL, 0ULL}), - BasicDecimal256FromLE( - {10412322338480586752ULL, 9151283399764878470ULL, 146936793852785938ULL, 0ULL}), - BasicDecimal256FromLE( - {11889503016258109440ULL, 17725857702810578241ULL, 1469367938527859384ULL, 0ULL}), - BasicDecimal256FromLE( - {8214565720323784704ULL, 11237880364719817872ULL, 14693679385278593849ULL, 0ULL}), - BasicDecimal256FromLE( - {8358680908399640576ULL, 1698339204940869028ULL, 17809585336819077184ULL, 7ULL}), - BasicDecimal256FromLE({9799832789158199296ULL, 16983392049408690284ULL, - 12075156704804807296ULL, 79ULL}), - BasicDecimal256FromLE({5764607523034234880ULL, 3813223830700938301ULL, - 10071102605790763273ULL, 796ULL}), - BasicDecimal256FromLE({2305843009213693952ULL, 1238750159590279781ULL, - 8477305689359874652ULL, 7965ULL}), - BasicDecimal256FromLE({4611686018427387904ULL, 12387501595902797811ULL, - 10986080598760540056ULL, 79654ULL}), - BasicDecimal256FromLE({9223372036854775808ULL, 13194551516770668416ULL, - 17627085619057642486ULL, 796545ULL}), - BasicDecimal256FromLE( - {0ULL, 2818306651739822853ULL, 10250159527190460323ULL, 7965459ULL}), - BasicDecimal256FromLE( - {0ULL, 9736322443688676914ULL, 10267874903356845151ULL, 79654595ULL}), - BasicDecimal256FromLE( - {0ULL, 5129504068339011060ULL, 10445028665020693435ULL, 796545955ULL}), - BasicDecimal256FromLE( - {0ULL, 14401552535971007368ULL, 12216566281659176272ULL, 7965459555ULL}), - BasicDecimal256FromLE( - {0ULL, 14888316843743212368ULL, 11485198374334453031ULL, 79654595556ULL}), - BasicDecimal256FromLE( - {0ULL, 
1309215847755710752ULL, 4171519301087220622ULL, 796545955566ULL}), - BasicDecimal256FromLE( - {0ULL, 13092158477557107520ULL, 4821704863453102988ULL, 7965459555662ULL}), - BasicDecimal256FromLE( - {0ULL, 1794376259604213888ULL, 11323560487111926655ULL, 79654595556622ULL}), - BasicDecimal256FromLE( - {0ULL, 17943762596042138880ULL, 2555140428861956854ULL, 796545955566226ULL}), - BasicDecimal256FromLE( - {0ULL, 13416929297035424256ULL, 7104660214910016933ULL, 7965459555662261ULL}), - BasicDecimal256FromLE( - {0ULL, 5042084454387381248ULL, 15706369927971514489ULL, 79654595556622613ULL}), - BasicDecimal256FromLE( - {0ULL, 13527356396454709248ULL, 9489746690038731964ULL, 796545955566226138ULL})}; + BasicDecimal256FromLE(0ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(50ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(500ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(50000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(500000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(50000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(500000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(50000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(500000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(50000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(500000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(50000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(500000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5000000000000000000ULL, 0ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(13106511852580896768ULL, 2ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1937910009842106368ULL, 27ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(932356024711512064ULL, 271ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(9323560247115120640ULL, 2710ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1001882102603448320ULL, 27105ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(10018821026034483200ULL, 271050ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL), + BasicDecimal256FromLE(3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL), + BasicDecimal256FromLE(15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL), + BasicDecimal256FromLE(11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 
0ULL), + BasicDecimal256FromLE(5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL), + BasicDecimal256FromLE(3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, + 0ULL), + BasicDecimal256FromLE(40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL), + BasicDecimal256FromLE(401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, + 0ULL), + BasicDecimal256FromLE(4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, + 0ULL), + BasicDecimal256FromLE(3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, + 0ULL), + BasicDecimal256FromLE(13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, + 0ULL), + BasicDecimal256FromLE(8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, + 0ULL), + BasicDecimal256FromLE(15758658046122786816ULL, 14498006295086537342ULL, + 146936793852ULL, 0ULL), + BasicDecimal256FromLE(10012627871551455232ULL, 15852854434898512116ULL, + 1469367938527ULL, 0ULL), + BasicDecimal256FromLE(7892558346966794240ULL, 10954591759308708237ULL, + 14693679385278ULL, 0ULL), + BasicDecimal256FromLE(5138607174829735936ULL, 17312197224539324294ULL, + 146936793852785ULL, 0ULL), + BasicDecimal256FromLE(14492583600878256128ULL, 7101275582007278398ULL, + 1469367938527859ULL, 0ULL), + BasicDecimal256FromLE(15798627492815699968ULL, 15672523598944129139ULL, + 14693679385278593ULL, 0ULL), + BasicDecimal256FromLE(10412322338480586752ULL, 9151283399764878470ULL, + 146936793852785938ULL, 0ULL), + BasicDecimal256FromLE(11889503016258109440ULL, 17725857702810578241ULL, + 1469367938527859384ULL, 0ULL), + BasicDecimal256FromLE(8214565720323784704ULL, 11237880364719817872ULL, + 14693679385278593849ULL, 0ULL), + BasicDecimal256FromLE(8358680908399640576ULL, 1698339204940869028ULL, + 17809585336819077184ULL, 7ULL), + BasicDecimal256FromLE(9799832789158199296ULL, 16983392049408690284ULL, + 12075156704804807296ULL, 79ULL), + BasicDecimal256FromLE(5764607523034234880ULL, 3813223830700938301ULL, + 10071102605790763273ULL, 796ULL), + BasicDecimal256FromLE(2305843009213693952ULL, 1238750159590279781ULL, + 8477305689359874652ULL, 7965ULL), + BasicDecimal256FromLE(4611686018427387904ULL, 12387501595902797811ULL, + 10986080598760540056ULL, 79654ULL), + BasicDecimal256FromLE(9223372036854775808ULL, 13194551516770668416ULL, + 17627085619057642486ULL, 796545ULL), + BasicDecimal256FromLE(0ULL, 2818306651739822853ULL, 10250159527190460323ULL, + 7965459ULL), + BasicDecimal256FromLE(0ULL, 9736322443688676914ULL, 10267874903356845151ULL, + 79654595ULL), + BasicDecimal256FromLE(0ULL, 5129504068339011060ULL, 10445028665020693435ULL, + 796545955ULL), + BasicDecimal256FromLE(0ULL, 14401552535971007368ULL, 12216566281659176272ULL, + 7965459555ULL), + BasicDecimal256FromLE(0ULL, 14888316843743212368ULL, 11485198374334453031ULL, + 79654595556ULL), + BasicDecimal256FromLE(0ULL, 1309215847755710752ULL, 4171519301087220622ULL, + 796545955566ULL), + BasicDecimal256FromLE(0ULL, 13092158477557107520ULL, 4821704863453102988ULL, + 7965459555662ULL), + BasicDecimal256FromLE(0ULL, 1794376259604213888ULL, 11323560487111926655ULL, + 79654595556622ULL), + BasicDecimal256FromLE(0ULL, 17943762596042138880ULL, 2555140428861956854ULL, + 796545955566226ULL), + BasicDecimal256FromLE(0ULL, 13416929297035424256ULL, 7104660214910016933ULL, + 7965459555662261ULL), + BasicDecimal256FromLE(0ULL, 5042084454387381248ULL, 15706369927971514489ULL, + 79654595556622613ULL), + BasicDecimal256FromLE(0ULL, 13527356396454709248ULL, 9489746690038731964ULL, + 796545955566226138ULL)}; #undef 
BasicDecimal256FromLE From 9b3bf08f78f690605394ad3815177ece931f06d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 3 Aug 2023 23:04:15 +0200 Subject: [PATCH 090/749] GH-36752: [Python] Remove AWS SDK bundling when building wheels (#36925) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change In https://github.com/apache/arrow/pull/12227 we decided to use a bundled version of the AWS SDK when compiling Python wheels, in order to downgrade the AWS SDK version. Now that we have fixed S3 finalization issues (https://github.com/apache/arrow/pull/36442), it should be ok to rely on the vcpkg-installed version of the AWS SDK again. ### What changes are included in this PR? Remove use of bundled AWS SDK and use S3 vcpkg feature for requirements. ### Are these changes tested? On CI and via crossbow ### Are there any user-facing changes? No * Closes: #36752 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- .env | 2 +- ci/docker/python-wheel-manylinux.dockerfile | 3 ++- ci/docker/python-wheel-windows-vs2017.dockerfile | 3 ++- ci/scripts/java_jni_macos_build.sh | 1 - ci/scripts/python_wheel_manylinux_build.sh | 4 ---- ci/scripts/python_wheel_windows_build.bat | 4 ---- 6 files changed, 5 insertions(+), 12 deletions(-) diff --git a/.env b/.env index 25b2743f6542f..c9cd6c8094ed8 100644 --- a/.env +++ b/.env @@ -98,7 +98,7 @@ VCPKG="501db0f17ef6df184fcdbfbe0f87cde2313b6ab1" # 2023.04.15 Release # ci/docker/python-wheel-windows-vs2017.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2022-06-12 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2023-08-02 # Use conanio/${CONAN} for "docker-compose run --rm conan". 
See # https://github.com/conan-io/conan-docker-tools#readme for available diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index ed329ecdef6de..0f7779c878505 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -84,7 +84,8 @@ RUN vcpkg install \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ - --x-feature=parquet + --x-feature=parquet \ + --x-feature=s3 # Configure Python for applications running in the bash shell of this Dockerfile ARG python=3.8 diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index 01152dae232d8..531c4e678fafe 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -66,7 +66,8 @@ RUN vcpkg install \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ - --x-feature=parquet + --x-feature=parquet \ + --x-feature=s3 # Remove previous installations of python from the base image # NOTE: a more recent base image (tried with 2.12.1) comes with python 3.9.7 diff --git a/ci/scripts/java_jni_macos_build.sh b/ci/scripts/java_jni_macos_build.sh index 4a6f9444ec25f..d66c39a37c5bd 100755 --- a/ci/scripts/java_jni_macos_build.sh +++ b/ci/scripts/java_jni_macos_build.sh @@ -81,7 +81,6 @@ cmake \ -DARROW_PARQUET=${ARROW_PARQUET} \ -DARROW_S3=${ARROW_S3} \ -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ - -DAWSSDK_SOURCE=BUNDLED \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=${install_dir} \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index cb5c2fbb7cc62..58e42fea88088 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -85,9 +85,6 @@ fi mkdir /tmp/arrow-build pushd /tmp/arrow-build -# ARROW-17501: We can remove -DAWSSDK_SOURCE=BUNDLED once -# https://github.com/aws/aws-sdk-cpp/issues/1809 is fixed and vcpkg -# ships the fix. cmake \ -DARROW_ACERO=${ARROW_ACERO} \ -DARROW_BUILD_SHARED=ON \ @@ -120,7 +117,6 @@ cmake \ -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD} \ - -DAWSSDK_SOURCE=BUNDLED \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index ee879c7050dad..ffb43b3481e55 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -49,9 +49,6 @@ set ARROW_WITH_LZ4=ON set ARROW_WITH_SNAPPY=ON set ARROW_WITH_ZLIB=ON set ARROW_WITH_ZSTD=ON -@rem Workaround for https://github.com/aws/aws-sdk-cpp/issues/1809 . -@rem Use (old) bundled AWS SDK C++ instead of (newer) AWS SDK C++. 
-set AWSSDK_SOURCE=BUNDLED set CMAKE_UNITY_BUILD=ON set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 set VCPKG_ROOT=C:\vcpkg @@ -90,7 +87,6 @@ cmake ^ -DARROW_WITH_SNAPPY=%ARROW_WITH_SNAPPY% ^ -DARROW_WITH_ZLIB=%ARROW_WITH_ZLIB% ^ -DARROW_WITH_ZSTD=%ARROW_WITH_ZSTD% ^ - -DAWSSDK_SOURCE=%AWSSDK_SOURCE% ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ -DCMAKE_CXX_COMPILER=clcache ^ -DCMAKE_INSTALL_PREFIX=C:\arrow-dist ^ From fefd96d4f8917a59f1f72c8e0d9f73583b82bf43 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 4 Aug 2023 14:02:00 -0400 Subject: [PATCH 091/749] GH-36961: [MATLAB] Add `arrow.tabular.Schema` class and associated `arrow.schema` construction function (#37013) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change To continue building out the tabular APIs for the MATLAB interface, this PR adds a new `arrow.tabular.Schema` class which wraps one or more `arrow.type.Field` objects and semantically describes the names and types of the columns of a tabular Arrow data structure. To construct an `arrow.tabular.Schema` object, client code can call an associated `arrow.schema` construction function (similar to the `arrow.field` construction function). This mirrors the tabular APIs in other Arrow bindings, like `pyarrow`. ### What changes are included in this PR? 1. New `arrow.tabular.Schema` class. 2. New `arrow.schema(fields)` construction function for creating instances of `arrow.tabular.Schema`. **Example**: ```matlab >> fieldA = arrow.field("A", arrow.uint8); >> fieldB = arrow.field("B", arrow.string); >> fieldC = arrow.field("C", arrow.timestamp); >> fields = [fieldA, fieldB, fieldC]; >> schema = arrow.schema(fields) schema = A: uint8 B: string C: timestamp[us] >> schema.NumFields ans = int32 3 >> schema.FieldNames ans = 1×3 string array "A" "B" "C" >> f = schema.field(3) f = C: timestamp[us] >> f = schema.field("B") f = B: string ``` ### Are these changes tested? Yes. 1. Added a new test class `tSchema.m` which contains tests for `arrow.schema` and `arrow.tabular.Schema`. ### Are there any user-facing changes? Yes. 1. New public `arrow.tabular.Schema` class. 1.1 **Properties** 1.1.1 `NumFields` 1.1.2 `FieldNames` 1.1.3 `Fields` 1.2 **Methods** 1.2.1 `field(index)` where index is a valid numeric index or field name. 2. New public `arrow.schema(fields)` construction function. ### Future Directions 1. @sgilmore10 introduced some new input validation functions that are generic and reusable in #36978. To avoid using multiple different approaches to input validation across the MATLAB code base, it would be a good idea to re-implement the input validation for `Schema` methods (e.g. `field`) to use these validation functions consistently. 2. Error handling in some edge cases is less than ideal right now for `Schema`. We should consider doing a more thorough review of error handling and error messages across the MATLAB code base now that we have more APIs and have seen several similar error states appear in different parts of the code base (e.g. indexing errors). 3. We may want to consider alternative construction syntaxes beyond just `arrow.schema(fields)`. For example, `arrow.schema(fieldName_1, fieldType_1, ..., fieldName_i, fieldType_i, ... fieldName_n, fieldType_n)` might be another convenient syntax that we could consider supporting. 4. We should add a `Schema` property to `RecordBatch`. 5.
Consider adding a `toMATLAB` method for `Schema` which returns an empty MATLAB `table` with corresponding variable names and MATLAB types. * Closes: #36961 Lead-authored-by: Kevin Gurney Co-authored-by: Kevin Gurney Co-authored-by: Sutou Kouhei Signed-off-by: Kevin Gurney --- matlab/src/cpp/arrow/matlab/error/error.h | 4 + matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 + .../cpp/arrow/matlab/tabular/proxy/schema.cc | 184 +++++++ .../cpp/arrow/matlab/tabular/proxy/schema.h | 47 ++ matlab/src/matlab/+arrow/+tabular/Schema.m | 96 ++++ matlab/src/matlab/+arrow/schema.m | 34 ++ matlab/test/arrow/tabular/tSchema.m | 473 ++++++++++++++++++ .../cmake/BuildMatlabArrowInterface.cmake | 1 + 8 files changed, 841 insertions(+) create mode 100644 matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc create mode 100644 matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h create mode 100644 matlab/src/matlab/+arrow/+tabular/Schema.m create mode 100644 matlab/src/matlab/+arrow/schema.m create mode 100644 matlab/test/arrow/tabular/tSchema.m diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 3d134d169e7af..590cccfb9d6bf 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -173,4 +173,8 @@ namespace arrow::matlab::error { static const char* UKNOWN_TIME_UNIT_ERROR_ID = "arrow:matlab:UnknownTimeUnit"; static const char* FIELD_FAILED_TO_CREATE_TYPE_PROXY = "arrow:field:FailedToCreateTypeProxy"; static const char* ARRAY_FAILED_TO_CREATE_TYPE_PROXY = "arrow:array:FailedToCreateTypeProxy"; + static const char* ARROW_TABULAR_SCHEMA_INVALID_NUMERIC_FIELD_INDEX = "arrow:tabular:schema:InvalidNumericFieldIndex"; + static const char* ARROW_TABULAR_SCHEMA_UNKNOWN_FIELD_NAME = "arrow:tabular:schema:UnknownFieldName"; + static const char* ARROW_TABULAR_SCHEMA_AMBIGUOUS_FIELD_NAME = "arrow:tabular:schema:AmbiguousFieldName"; + static const char* ARROW_TABULAR_SCHEMA_NUMERIC_FIELD_INDEX_WITH_EMPTY_SCHEMA = "arrow:tabular:schema:NumericFieldIndexWithEmptySchema"; } diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index ac9a595a45852..7d18c6c6b62a8 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -19,6 +19,7 @@ #include "arrow/matlab/array/proxy/numeric_array.h" #include "arrow/matlab/array/proxy/string_array.h" #include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/tabular/proxy/schema.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/type/proxy/primitive_ctype.h" #include "arrow/matlab/type/proxy/string_type.h" @@ -44,6 +45,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.array.proxy.StringArray , arrow::matlab::array::proxy::StringArray); REGISTER_PROXY(arrow.array.proxy.TimestampArray, arrow::matlab::array::proxy::NumericArray); REGISTER_PROXY(arrow.tabular.proxy.RecordBatch , arrow::matlab::tabular::proxy::RecordBatch); + REGISTER_PROXY(arrow.tabular.proxy.Schema , arrow::matlab::tabular::proxy::Schema); REGISTER_PROXY(arrow.type.proxy.Field , arrow::matlab::type::proxy::Field); REGISTER_PROXY(arrow.type.proxy.Float32Type , arrow::matlab::type::proxy::PrimitiveCType); REGISTER_PROXY(arrow.type.proxy.Float64Type , arrow::matlab::type::proxy::PrimitiveCType); diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc new file mode 100644 index 
0000000000000..62fe863ca8b5f --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/matlab/error/error.h" +#include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/matlab/type/proxy/field.h" + +#include "libmexclass/proxy/ProxyManager.h" +#include "libmexclass/error/Error.h" + +#include "arrow/util/utf8.h" + +#include <sstream> + +namespace arrow::matlab::tabular::proxy { + + namespace { + + libmexclass::error::Error makeUnknownFieldNameError(const std::string& name) { + using namespace libmexclass::error; + std::stringstream error_message_stream; + error_message_stream << "Unknown field name: '"; + error_message_stream << name; + error_message_stream << "'."; + return Error{error::ARROW_TABULAR_SCHEMA_UNKNOWN_FIELD_NAME, error_message_stream.str()}; + } + + libmexclass::error::Error makeEmptySchemaError() { + using namespace libmexclass::error; + return Error{error::ARROW_TABULAR_SCHEMA_NUMERIC_FIELD_INDEX_WITH_EMPTY_SCHEMA, + "Numeric indexing using the field method is not supported for schemas with no fields."}; + } + + } + + Schema::Schema(std::shared_ptr<arrow::Schema> schema) : schema{std::move(schema)} { + REGISTER_METHOD(Schema, getFieldByIndex); + REGISTER_METHOD(Schema, getFieldByName); + REGISTER_METHOD(Schema, getNumFields); + REGISTER_METHOD(Schema, getFieldNames); + REGISTER_METHOD(Schema, toString); + } + + libmexclass::proxy::MakeResult Schema::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using SchemaProxy = arrow::matlab::tabular::proxy::Schema; + + mda::StructArray args = constructor_arguments[0]; + const mda::TypedArray<uint64_t> field_proxy_ids_mda = args[0]["FieldProxyIDs"]; + + std::vector<std::shared_ptr<arrow::Field>> fields; + for (const auto proxy_id : field_proxy_ids_mda) { + using namespace libmexclass::proxy; + auto proxy = std::static_pointer_cast<arrow::matlab::type::proxy::Field>(ProxyManager::getProxy(proxy_id)); + auto field = proxy->unwrap(); + fields.push_back(field); + } + auto schema = arrow::schema(fields); + return std::make_shared<SchemaProxy>(std::move(schema)); + } + + std::shared_ptr<arrow::Schema> Schema::unwrap() { + return schema; + } + + void Schema::getFieldByIndex(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + using FieldProxy = arrow::matlab::type::proxy::Field; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray<int32_t> index_mda = args[0]["Index"]; + const auto matlab_index = int32_t(index_mda[0]); + // Note: MATLAB uses 1-based indexing, so subtract 1. + // arrow::Schema::field does not do any bounds checking.
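+ // For example, a MATLAB call like schema.field(2) arrives here with matlab_index == 2; the subtraction below maps it to the zero-based arrow::Schema::field(1).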
+ const int32_t index = matlab_index - 1; + const auto num_fields = schema->num_fields(); + + if (num_fields == 0) { + const auto& error = makeEmptySchemaError(); + context.error = error; + return; + } + + if (matlab_index < 1 || matlab_index > num_fields) { + using namespace libmexclass::error; + const std::string& error_message_id = std::string{error::ARROW_TABULAR_SCHEMA_INVALID_NUMERIC_FIELD_INDEX}; + std::stringstream error_message_stream; + error_message_stream << "Invalid field index: "; + error_message_stream << matlab_index; + error_message_stream << ". Field index must be between 1 and the number of fields ("; + error_message_stream << num_fields; + error_message_stream << ")."; + const std::string& error_message = error_message_stream.str(); + context.error = Error{error_message_id, error_message}; + return; + } + + const auto& field = schema->field(index); + auto field_proxy = std::make_shared<FieldProxy>(field); + const auto field_proxy_id = ProxyManager::manageProxy(field_proxy); + const auto field_proxy_id_mda = factory.createScalar(field_proxy_id); + + context.outputs[0] = field_proxy_id_mda; + } + + void Schema::getFieldByName(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + using FieldProxy = arrow::matlab::type::proxy::Field; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::StringArray name_mda = args[0]["Name"]; + const auto name_utf16 = std::u16string(name_mda[0]); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto name, arrow::util::UTF16StringToUTF8(name_utf16), context, error::UNICODE_CONVERSION_ERROR_ID); + const std::vector<std::string> names = {name}; + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(schema->CanReferenceFieldsByNames(names), context, error::ARROW_TABULAR_SCHEMA_AMBIGUOUS_FIELD_NAME); + + const auto field = schema->GetFieldByName(name); + auto field_proxy = std::make_shared<FieldProxy>(field); + const auto field_proxy_id = ProxyManager::manageProxy(field_proxy); + const auto field_proxy_id_mda = factory.createScalar(field_proxy_id); + + context.outputs[0] = field_proxy_id_mda; + } + + void Schema::getNumFields(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const auto num_fields = schema->num_fields(); + const auto num_fields_mda = factory.createScalar(num_fields); + + context.outputs[0] = num_fields_mda; + } + + void Schema::getFieldNames(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const auto field_names_utf8 = schema->field_names(); + const auto num_fields = static_cast<size_t>(schema->num_fields()); + + std::vector<std::u16string> field_names_utf16; + field_names_utf16.reserve(num_fields); + + // Convert the field names from UTF-8 to UTF-16.
+ for (const auto& field_name_utf8 : field_names_utf8) { + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto field_name_utf16, arrow::util::UTF8StringToUTF16(field_name_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + field_names_utf16.push_back(field_name_utf16); + } + + const auto field_names_mda = factory.createArray({1, num_fields}, field_names_utf16.cbegin(), field_names_utf16.cend()); + + context.outputs[0] = field_names_mda; + } + + void Schema::toString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + const auto str_utf8 = schema->ToString(); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto str_utf16, arrow::util::UTF8StringToUTF16(str_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); + auto str_mda = factory.createScalar(str_utf16); + context.outputs[0] = str_mda; + } + +} diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h new file mode 100644 index 0000000000000..30883bc2a85ac --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +// arrow::Schema is defined in type.h. +#include "arrow/type.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::tabular::proxy { + + class Schema : public libmexclass::proxy::Proxy { + public: + Schema(std::shared_ptr<arrow::Schema> schema); + + virtual ~Schema() {} + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + std::shared_ptr<arrow::Schema> unwrap(); + + protected: + void getFieldByIndex(libmexclass::proxy::method::Context& context); + void getFieldByName(libmexclass::proxy::method::Context& context); + void getNumFields(libmexclass::proxy::method::Context& context); + void getFieldNames(libmexclass::proxy::method::Context& context); + void toString(libmexclass::proxy::method::Context& context); + + std::shared_ptr<arrow::Schema> schema; + }; + +} diff --git a/matlab/src/matlab/+arrow/+tabular/Schema.m b/matlab/src/matlab/+arrow/+tabular/Schema.m new file mode 100644 index 0000000000000..3d75e6c44a2df --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/Schema.m @@ -0,0 +1,96 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License.
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Schema < matlab.mixin.CustomDisplay +%SCHEMA A tabular schema which semantically describes +% the names and types of the columns of an associated tabular +% Arrow data type. + + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + properties (Dependent, SetAccess=private, GetAccess=public) + % Underlying array of Fields that the Schema wraps. + Fields + % Names of the columns in the associated tabular type. + FieldNames + % Number of fields in the schema + NumFields + end + + methods + + function obj = Schema(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.tabular.proxy.Schema")} + end + import arrow.internal.proxy.validate + + obj.Proxy = proxy; + end + + function F = field(obj, idx) + idx = convertCharsToStrings(idx); + if ~isempty(idx) && isscalar(idx) && isnumeric(idx) && idx >= 1 + args = struct(Index=int32(idx)); + proxyID = obj.Proxy.getFieldByIndex(args); + elseif isscalar(idx) && isstring(idx) + name = idx; + args = struct(Name=name); + proxyID = obj.Proxy.getFieldByName(args); + else + error("arrow:tabular:schema:UnsupportedFieldIndexType", ... + "Index must be a positive scalar integer or a valid field name."); + end + + proxy = libmexclass.proxy.Proxy(Name="arrow.type.proxy.Field", ID=proxyID); + F = arrow.type.Field(proxy); + end + + function fields = get.Fields(obj) + fields = arrow.type.Field.empty(0, obj.NumFields); + for ii = 1:obj.NumFields + fields(ii) = obj.field(ii); + end + end + + function fieldNames = get.FieldNames(obj) + fieldNames = obj.Proxy.getFieldNames(); + end + + function numFields = get.NumFields(obj) + numFields = obj.Proxy.getNumFields(); + end + + end + + methods (Access = private) + + function str = toString(obj) + str = obj.Proxy.toString(); + end + + end + + methods (Access=protected) + + function displayScalarObject(obj) + disp(obj.toString()); + end + + end + +end diff --git a/matlab/src/matlab/+arrow/schema.m b/matlab/src/matlab/+arrow/schema.m new file mode 100644 index 0000000000000..b61fcabe69f28 --- /dev/null +++ b/matlab/src/matlab/+arrow/schema.m @@ -0,0 +1,34 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function s = schema(fields) +%SCHEMA Constructs an arrow.tabular.Schema object + arguments + fields(1, :) arrow.type.Field + end + + % Extract the corresponding Proxy IDs from each of the + % supplied arrow.type.Field objects. 
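+ % For example, a 1x3 Field array yields a 1x3 uint64 vector below, with one Proxy ID per supplied Field.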
+ numFields = numel(fields); + fieldProxyIDs = zeros(1, numFields, "uint64"); + for ii = 1:numFields + fieldProxyIDs(ii) = fields(ii).Proxy.ID; + end + + % Construct an Arrow Schema Proxy in C++ from the supplied Field Proxy IDs. + args = struct(FieldProxyIDs=fieldProxyIDs); + proxy = arrow.internal.proxy.create("arrow.tabular.proxy.Schema", args); + s = arrow.tabular.Schema(proxy); +end \ No newline at end of file diff --git a/matlab/test/arrow/tabular/tSchema.m b/matlab/test/arrow/tabular/tSchema.m new file mode 100644 index 0000000000000..d550c488fd86d --- /dev/null +++ b/matlab/test/arrow/tabular/tSchema.m @@ -0,0 +1,473 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tSchema < matlab.unittest.TestCase +% Tests for the arrow.tabular.Schema class and the associated arrow.schema +% construction function. + + methods(Test) + + function ErrorIfUnsupportedInputType(testCase) + % Verify that an error is thrown by arrow.schema if an + % unsupported input argument is supplied. + testCase.verifyError(@() arrow.schema("test"), "MATLAB:validation:UnableToConvert"); + end + + function ErrorIfUnsupportedConstructorInputs(testCase) + % Verify that an error is thrown by the constructor of + % arrow.tabular.Schema if unsupported arguments are passed to + % the constructor. + testCase.verifyError(@() arrow.tabular.Schema("test"), "MATLAB:validation:UnableToConvert"); + end + + function ErrorIfTooFewInputs(testCase) + % Verify that an error is thrown by arrow.schema if too few + % input arguments are supplied. + testCase.verifyError(@() arrow.schema(), "MATLAB:minrhs"); + end + + function ErrorIfTooManyInputs(testCase) + % Verify that an error is thrown by arrow.schema if too many + % input arguments are supplied. + testCase.verifyError(@() arrow.schema("a", "b", "c"), "MATLAB:TooManyInputs"); + end + + function ClassType(testCase) + % Verify that the class type of the object returned by a call + % to arrow.schema is "arrow.tabular.Schema". + schema = arrow.schema(arrow.field("A", arrow.uint8)); + testCase.verifyInstanceOf(schema, "arrow.tabular.Schema"); + end + + function ConstructSchemaFromProxy(testCase) + % Verify that an arrow.tabular.Schema instance can be + % constructed directly from an existing + % arrow.tabular.proxy.Schema Proxy instance. + schema1 = arrow.schema(arrow.field("a", arrow.uint8)); + % Construct an instance of arrow.tabular.Schema directly from a + % Proxy of type "arrow.tabular.proxy.Schema". + schema2 = arrow.tabular.Schema(schema1.Proxy); + testCase.verifyEqual(schema1.FieldNames, schema2.FieldNames); + testCase.verifyEqual(schema1.NumFields, schema2.NumFields); + end + + function Fields(testCase) + % Verify that the Fields property returns an expected array of + % Field objects.
+ f1 = arrow.field("A", arrow.uint8); + f2 = arrow.field("B", arrow.uint16); + f3 = arrow.field("C", arrow.uint32); + expectedFields = [f1, f2, f3]; + schema = arrow.schema(expectedFields); + + actualFields = schema.Fields; + + testCase.verifyEqual(actualFields(1).Name, expectedFields(1).Name); + testCase.verifyEqual(actualFields(1).Type.ID, expectedFields(1).Type.ID); + testCase.verifyEqual(actualFields(2).Name, expectedFields(2).Name); + testCase.verifyEqual(actualFields(2).Type.ID, expectedFields(2).Type.ID); + testCase.verifyEqual(actualFields(3).Name, expectedFields(3).Name); + testCase.verifyEqual(actualFields(3).Type.ID, expectedFields(3).Type.ID); + end + + function FieldNames(testCase) + % Verify that the FieldNames property returns an expected + % string array of field names. + expectedFieldNames = ["A" , "B" , "C"]; + schema = arrow.schema([... + arrow.field(expectedFieldNames(1), arrow.uint8), ... + arrow.field(expectedFieldNames(2), arrow.uint16), ... + arrow.field(expectedFieldNames(3), arrow.uint32) ... + ]); + actualFieldNames = schema.FieldNames; + testCase.verifyEqual(actualFieldNames, expectedFieldNames); + end + + function FieldNamesNoSetter(testCase) + % Verify that an error is thrown when trying to set the value + % of the FieldNames property. + schema = arrow.schema(arrow.field("A", arrow.uint8)); + testCase.verifyError(@() setfield(schema, "FieldNames", "B"), "MATLAB:class:SetProhibited"); + end + + function NumFieldsNoSetter(testCase) + % Verify that an error is thrown when trying to set the value + % of the NumFields property. + schema = arrow.schema(arrow.field("A", arrow.uint8)); + testCase.verifyError(@() setfield(schema, "NumFields", 123), "MATLAB:class:SetProhibited"); + end + + function FieldsNoSetter(testCase) + % Verify that an error is thrown when trying to set the value + % of the Fields property. + schema = arrow.schema(arrow.field("A", arrow.uint8)); + testCase.verifyError(@() setfield(schema, "Fields", arrow.field("B", arrow.uint8)), "MATLAB:class:SetProhibited"); + end + + function NumFields(testCase) + % Verify that the NumFields property returns an expected number + % of fields. + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) ... + ]); + expectedNumFields = int32(3); + actualNumFields = schema.NumFields; + testCase.verifyEqual(actualNumFields, expectedNumFields); + end + + function ErrorIfUnsupportedFieldIndex(testCase) + % Verify that an error is thrown if an invalid field index is + % supplied to the field method (e.g. -1.1, NaN, {1}, etc.). + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) ...
+ ]); + + index = []; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + index = 0; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + index = -1; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + index = -1.23; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + index = NaN; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + index = {1}; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + index = [1; 1]; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:UnsupportedFieldIndexType"); + end + + function GetFieldByIndex(testCase) + % Verify that Fields can be accessed using a numeric index. + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) ... + ]); + + field = schema.field(1); + testCase.verifyEqual(field.Name, "A"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt8); + + field = schema.field(2); + testCase.verifyEqual(field.Name, "B"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt16); + + field = schema.field(3); + testCase.verifyEqual(field.Name, "C"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt32); + end + + function GetFieldByName(testCase) + % Verify that Fields can be accessed using a field name. + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) ... + ]); + + field = schema.field("A"); + testCase.verifyEqual(field.Name, "A"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt8); + + field = schema.field("B"); + testCase.verifyEqual(field.Name, "B"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt16); + + field = schema.field("C"); + testCase.verifyEqual(field.Name, "C"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt32); + end + + function GetFieldByNameWithEmptyString(testCase) + % Verify that a Field whose name is the empty string ("") + % can be accessed using the field() method. + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("", arrow.uint16), ... + arrow.field("C", arrow.uint32) ... + ]); + + field = schema.field(""); + + testCase.verifyEqual(field.Name, ""); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt16); + end + + function GetFieldByNameWithWhitespace(testCase) + % Verify that a Field whose name contains only whitespace + % characters can be accessed using the field() method. + schema = arrow.schema([... + arrow.field(" ", arrow.uint8), ... + arrow.field(" ", arrow.uint16), ... + arrow.field(" ", arrow.uint32) ... + ]); + + field = schema.field(" "); + testCase.verifyEqual(field.Name, " "); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt8); + + field = schema.field(" "); + testCase.verifyEqual(field.Name, " "); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt16); + + field = schema.field(" "); + testCase.verifyEqual(field.Name, " "); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt32); + end + + function ErrorIfInvalidNumericFieldIndex(testCase) + % Verify that an error is thrown when trying to access a field + % with an invalid numeric index (e.g. greater than NumFields). + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... 
+ arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) ... + ]); + + % Index is greater than NumFields. + index = 100; + testCase.verifyError(@() schema.field(index), "arrow:tabular:schema:InvalidNumericFieldIndex"); + end + + function ErrorIfFieldNameDoesNotExist(testCase) + % Verify that an error is thrown when trying to access a field + % with a name that is not part of the schema. + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) ... + ]); + + % Matching should be case sensitive. + fieldName = "a"; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + + fieldName = "aA"; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + + fieldName = "D"; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + + fieldName = ""; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + + fieldName = " "; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + end + + function ErrorIfAmbiguousFieldName(testCase) + % Verify that an error is thrown when trying to access a field + % with a name that is ambiguous / occurs more than once in the + % schema. + schema = arrow.schema([... + arrow.field("A", arrow.uint8), ... + arrow.field("A", arrow.uint16), ... + arrow.field("B", arrow.uint32), ... + arrow.field("B", arrow.uint32) + ]); + + fieldName = "A"; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + + fieldName = "B"; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); + end + + function SupportedFieldTypes(testCase) + % Verify that a Schema can be created from Fields with any + % supported Type. + fields = [ ... + arrow.field("A", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32), ... + arrow.field("D", arrow.uint64), ... + arrow.field("E", arrow.int8), ... + arrow.field("F", arrow.int16), ... + arrow.field("G", arrow.int32), ... + arrow.field("H", arrow.int64), ... + arrow.field("I", arrow.float32), ... + arrow.field("J", arrow.float64), ... + arrow.field("K", arrow.boolean), ... + arrow.field("L", arrow.string), ... + arrow.field("M", arrow.timestamp), ... 
+ ]; + + schema = arrow.schema(fields); + + testCase.verifyEqual(schema.field("A").Type.ID, arrow.type.ID.UInt8); + testCase.verifyEqual(schema.field("B").Type.ID, arrow.type.ID.UInt16); + testCase.verifyEqual(schema.field("C").Type.ID, arrow.type.ID.UInt32); + testCase.verifyEqual(schema.field("D").Type.ID, arrow.type.ID.UInt64); + testCase.verifyEqual(schema.field("E").Type.ID, arrow.type.ID.Int8); + testCase.verifyEqual(schema.field("F").Type.ID, arrow.type.ID.Int16); + testCase.verifyEqual(schema.field("G").Type.ID, arrow.type.ID.Int32); + testCase.verifyEqual(schema.field("H").Type.ID, arrow.type.ID.Int64); + testCase.verifyEqual(schema.field("I").Type.ID, arrow.type.ID.Float32); + testCase.verifyEqual(schema.field("J").Type.ID, arrow.type.ID.Float64); + testCase.verifyEqual(schema.field("K").Type.ID, arrow.type.ID.Boolean); + testCase.verifyEqual(schema.field("L").Type.ID, arrow.type.ID.String); + testCase.verifyEqual(schema.field("M").Type.ID, arrow.type.ID.Timestamp); + end + + function UnicodeFieldNames(testCase) + % Verify that Field names containing Unicode characters are + % preserved with the FieldNames property. + smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + expectedFieldNames = [smiley, tree, mango]; + + f1 = arrow.field(expectedFieldNames(1), arrow.uint8); + f2 = arrow.field(expectedFieldNames(2), arrow.uint16); + f3 = arrow.field(expectedFieldNames(3), arrow.uint32); + fields = [f1, f2, f3]; + + schema = arrow.schema(fields); + + actualFieldNames = schema.FieldNames; + + testCase.verifyEqual(actualFieldNames, expectedFieldNames); + end + + function EmptyFieldNames(testCase) + % Verify that Field names which are the empty string are + % preserved with the FieldNames property. + expectedFieldNames = ["", "B", "C"]; + schema = arrow.schema([... + arrow.field("", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("C", arrow.uint32) + ]); + actualFieldNames = schema.FieldNames; + testCase.verifyEqual(actualFieldNames, expectedFieldNames); + end + + function EmptySchema(testCase) + % Verify that a Schema with no Fields can be created. + + % 0x0 empty Field array. + fields = arrow.type.Field.empty(0, 0); + schema = arrow.schema(fields); + testCase.verifyEqual(schema.NumFields, int32(0)); + testCase.verifyEqual(schema.FieldNames, string.empty(1, 0)); + testCase.verifyEqual(schema.Fields, arrow.type.Field.empty(0, 0)); + testCase.verifyError(@() schema.field(0), "arrow:tabular:schema:UnsupportedFieldIndexType"); + testCase.verifyError(@() schema.field(1), "arrow:tabular:schema:NumericFieldIndexWithEmptySchema"); + + % 0x1 empty Field array. + fields = arrow.type.Field.empty(0, 1); + schema = arrow.schema(fields); + testCase.verifyEqual(schema.NumFields, int32(0)); + testCase.verifyEqual(schema.FieldNames, string.empty(1, 0)); + testCase.verifyEqual(schema.Fields, arrow.type.Field.empty(0, 0)); + testCase.verifyError(@() schema.field(0), "arrow:tabular:schema:UnsupportedFieldIndexType"); + testCase.verifyError(@() schema.field(1), "arrow:tabular:schema:NumericFieldIndexWithEmptySchema"); + + % 1x0 empty Field array. 
+ fields = arrow.type.Field.empty(1, 0); + schema = arrow.schema(fields); + testCase.verifyEqual(schema.NumFields, int32(0)); + testCase.verifyEqual(schema.FieldNames, string.empty(1, 0)); + testCase.verifyEqual(schema.Fields, arrow.type.Field.empty(0, 0)); + testCase.verifyError(@() schema.field(0), "arrow:tabular:schema:UnsupportedFieldIndexType"); + testCase.verifyError(@() schema.field(1), "arrow:tabular:schema:NumericFieldIndexWithEmptySchema"); + end + + function GetFieldByNameWithChar(testCase) + % Verify that the field method works when supplied a char + % vector as input. + schema = arrow.schema([... + arrow.field("", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("123", arrow.uint32) + ]); + + % Should match the first field whose name is the + % empty string (""). + fieldName = char.empty(0, 0); + field = schema.field(fieldName); + testCase.verifyEqual(field.Name, ""); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt8); + + fieldName = char.empty(0, 1); + field = schema.field(fieldName); + testCase.verifyEqual(field.Name, ""); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt8); + + fieldName = char.empty(1, 0); + field = schema.field(fieldName); + testCase.verifyEqual(field.Name, ""); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt8); + + % Should match the second field whose name is "B". + fieldName = 'B'; + field = schema.field(fieldName); + testCase.verifyEqual(field.Name, "B"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt16); + + % Should match the third field whose name is "123". + fieldName = '123'; + field = schema.field(fieldName); + testCase.verifyEqual(field.Name, "123"); + testCase.verifyEqual(field.Type.ID, arrow.type.ID.UInt32); + end + + function ErrorIfNumericIndexIsNonScalar(testCase) + % Verify that an error is thrown if a nonscalar numeric index + % is supplied to the field method. + schema = arrow.schema([... + arrow.field("", arrow.uint8), ... + arrow.field("B", arrow.uint16), ... + arrow.field("123", arrow.uint32) + ]); + + fieldName = [1, 2, 3]; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + fieldName = [1; 2; 3]; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:UnsupportedFieldIndexType"); + end + + function ErrorIfFieldNameIsNonScalar(testCase) + % Verify that an error is thrown if a nonscalar string array is + % specified as a field name to the field method. + schema = arrow.schema([... + arrow.field("", arrow.uint8), ... + arrow.field("B", arrow.uint16), ...
+ arrow.field("123", arrow.uint32) + ]); + + fieldName = ["A", "B", "C"]; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:UnsupportedFieldIndexType"); + + fieldName = ["A"; "B"; "C"]; + testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:UnsupportedFieldIndexType"); + end + + end + +end diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 530799c15c172..a316a27e55d2b 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -45,6 +45,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/boolean_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/string_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/schema.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/pack.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/unpack.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/time_unit.cc" From 6dc6af48cc92640fabdc3662f2b87193c4cb6fe2 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 4 Aug 2023 14:44:15 -0400 Subject: [PATCH 092/749] GH-37012: [MATLAB] Remove the private property `ArrowArrays` from `arrow.tabular.RecordBatch` (#37015) ### Rationale for this change In our initial implementation of the MATLAB class `arrow.tabular.RecordBatch`, we included a property property called `ArrowArrays`, which is a `cell` array whose elements are scalar `arrow.array.Array` objects. These arrays correspond to the columns in the `arrow::RecordBatch` that the MATLAB class wraps. The purpose of `ArrowArrays` was to enable zero-copy construction of the `arrow::Array` objects backing the `arrow::RecordBatch` from MATLAB arrays. The `ArrowArrays` property was necessary to ensure the MATLAB data from which the `arrow::Array` columns were constructed don't get deallocated before they are done being used. However, we no longer need the `ArrowArrays` property on `arrow.tabular.RecordBatch` because of #36615, in which we implemented `arrow::matlab::buffer::MatlabBuffer`. This class inherits from `arrow::Buffer` and stores a reference to the MATLAB data it wraps, ensuring that the wrapped MATLAB data is kept alive as long as the buffer is around. We now only create `arrow::Array` objects from `arrow::matlab::buffer::MatlabBuffer` objects - instead of `arrow::Buffer` objects. As a result, the `ArrowArrays` property is no longer necessary on `arrow.tabular.RecordBatch` because the `arrow::Array` columns within the `arrow::RecordBatch` are all backed by `arrow::matlab::buffer::MatlabBuffer` objects. The backing MATLAB data is kept alive as long as the arrays and their buffers are kept alive. ### What changes are included in this PR? 1. Removed the `ArrowArrays` property from `arrow.tabular.RecordBatch`. 2. Because `ArrowArrays` is no longer a property, we had to add a new method to `arrow::matlab::tabular::proxy::RecordBatch` called `getColumnByIndex()`. This method creates a proxy object around the specified `arrrow::Array`. The MATLAB method `RecordBatch/column(index)` uses `getColumnByIndex()`. 3. Added a new function called `wrap`, which accepts an `arrow::Array` and returns an `arrow::matlab::array::proxy::Array`. 
While working on `getColumnByIndex()`, we realized there would be multiple places in the interface where we will need to create proxies around `arrow::Array` objects. We wrote the `wrap` utility function to reduce duplicated code in the future. Currently, only `getColumnByIndex()` utilizes `wrap`. ### Are these changes tested? Yes. 1. The existing unit tests in `tRecordBatch.m` cover these changes. 2. Added a few more test cases to `tRecordBatch.m` to cover error conditions when a bad `index` value is provided to `arrow.tabular.RecordBatch/column(index)`. ### Are there any user-facing changes? No. ### Future Directions 1. In a followup PR, we will add support for indexing columns by names (similar to `Schema.field(fieldName)` - #37013). 2. We also plan on adding an index validation function within the `arrow.internal.validate` package. 3. We also plan on adding a convenience constructor `arrow.recordbatch()` and changing the constructor of `arrow.tabular.RecordBatch` to expect a `libmexclass.proxy.Proxy` object. * Closes: #37012 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- .../arrow/matlab/array/proxy/numeric_array.h | 2 +- .../src/cpp/arrow/matlab/array/proxy/wrap.cc | 60 ++++++++++++++++++ .../src/cpp/arrow/matlab/array/proxy/wrap.h | 29 +++++++++ matlab/src/cpp/arrow/matlab/error/error.h | 3 + .../matlab/tabular/proxy/record_batch.cc | 63 +++++++++++++++++++ .../arrow/matlab/tabular/proxy/record_batch.h | 3 +- matlab/src/cpp/arrow/matlab/type/proxy/wrap.h | 2 + .../src/matlab/+arrow/+tabular/RecordBatch.m | 28 ++++++--- matlab/test/arrow/tabular/tRecordBatch.m | 23 ++++++- .../cmake/BuildMatlabArrowInterface.cmake | 1 + 10 files changed, 201 insertions(+), 13 deletions(-) create mode 100644 matlab/src/cpp/arrow/matlab/array/proxy/wrap.cc create mode 100644 matlab/src/cpp/arrow/matlab/array/proxy/wrap.h diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h index 6893079c78b95..6bfdad1f5db55 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h @@ -89,7 +89,7 @@ class NumericArray : public arrow::matlab::array::proxy::Array { // Specialization of NumericArray::Make for arrow::TimestampType. template <> - libmexclass::proxy::MakeResult NumericArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + inline libmexclass::proxy::MakeResult NumericArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { namespace mda = ::matlab::data; using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer; using TimestampArray = arrow::TimestampArray; diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/wrap.cc b/matlab/src/cpp/arrow/matlab/array/proxy/wrap.cc new file mode 100644 index 0000000000000..dab09359598d4 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/array/proxy/wrap.cc @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +#include "arrow/matlab/array/proxy/wrap.h" +#include "arrow/matlab/array/proxy/array.h" +#include "arrow/matlab/array/proxy/boolean_array.h" +#include "arrow/matlab/array/proxy/numeric_array.h" +#include "arrow/matlab/array/proxy/string_array.h" + +namespace arrow::matlab::array::proxy { + + arrow::Result> wrap(const std::shared_ptr& array) { + using ID = arrow::Type::type; + switch (array->type_id()) { + case ID::BOOL: + return std::make_shared(std::static_pointer_cast(array)); + case ID::UINT8: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::UINT16: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::UINT32: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::UINT64: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::INT8: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::INT16: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::INT32: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::INT64: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::FLOAT: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::DOUBLE: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::TIMESTAMP: + return std::make_shared>(std::static_pointer_cast(array)); + case ID::STRING: + return std::make_shared(std::static_pointer_cast(array)); + default: + return arrow::Status::NotImplemented("Unsupported DataType: " + array->type()->ToString()); + } + } +} \ No newline at end of file diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/wrap.h b/matlab/src/cpp/arrow/matlab/array/proxy/wrap.h new file mode 100644 index 0000000000000..5ccb498f7689a --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/array/proxy/wrap.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/array.h" +#include "arrow/result.h" + +#include "arrow/matlab/array/proxy/array.h" + +namespace arrow::matlab::array::proxy { + + arrow::Result> wrap(const std::shared_ptr& array); + +} diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 590cccfb9d6bf..b7c0d7d696d65 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -177,4 +177,7 @@ namespace arrow::matlab::error { static const char* ARROW_TABULAR_SCHEMA_UNKNOWN_FIELD_NAME = "arrow:tabular:schema:UnknownFieldName"; static const char* ARROW_TABULAR_SCHEMA_AMBIGUOUS_FIELD_NAME = "arrow:tabular:schema:AmbiguousFieldName"; static const char* ARROW_TABULAR_SCHEMA_NUMERIC_FIELD_INDEX_WITH_EMPTY_SCHEMA = "arrow:tabular:schema:NumericFieldIndexWithEmptySchema"; + static const char* UNKNOWN_PROXY_FOR_ARRAY_TYPE = "arrow:array:UnknownProxyForArrayType"; + static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch"; + static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex"; } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc index c0b73833e5a0f..ed30472f6c4a2 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc @@ -18,17 +18,42 @@ #include "libmexclass/proxy/ProxyManager.h" #include "arrow/matlab/array/proxy/array.h" +#include "arrow/matlab/array/proxy/wrap.h" + #include "arrow/matlab/error/error.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/type.h" #include "arrow/util/utf8.h" +#include "libmexclass/proxy/ProxyManager.h" +#include "libmexclass/error/Error.h" + +#include + namespace arrow::matlab::tabular::proxy { + namespace { + libmexclass::error::Error makeEmptyRecordBatchError() { + const std::string error_msg = "Numeric indexing using the column method is not supported for record batches with no columns."; + return libmexclass::error::Error{error::RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH, error_msg}; + } + + libmexclass::error::Error makeInvalidNumericIndexError(const int32_t matlab_index, const int32_t num_columns) { + std::stringstream error_message_stream; + error_message_stream << "Invalid column index: "; + error_message_stream << matlab_index; + error_message_stream << ". 
Column index must be between 1 and the number of columns ("; + error_message_stream << num_columns; + error_message_stream << ")."; + return libmexclass::error::Error{error::RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX, error_message_stream.str()}; + } + } + RecordBatch::RecordBatch(std::shared_ptr record_batch) : record_batch{record_batch} { REGISTER_METHOD(RecordBatch, toString); REGISTER_METHOD(RecordBatch, numColumns); REGISTER_METHOD(RecordBatch, columnNames); + REGISTER_METHOD(RecordBatch, getColumnByIndex); } void RecordBatch::toString(libmexclass::proxy::method::Context& context) { @@ -96,4 +121,42 @@ namespace arrow::matlab::tabular::proxy { context.outputs[0] = column_names_mda; } + void RecordBatch::getColumnByIndex(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray index_mda = args[0]["Index"]; + const auto matlab_index = int32_t(index_mda[0]); + + // Note: MATLAB uses 1-based indexing, so subtract 1. + // arrow::Schema::field does not do any bounds checking. + const int32_t index = matlab_index - 1; + const auto num_columns = record_batch->num_columns(); + + if (num_columns == 0) { + context.error = makeEmptyRecordBatchError(); + return; + } + + if (matlab_index < 1 || matlab_index > num_columns) { + context.error = makeInvalidNumericIndexError(matlab_index, num_columns); + return; + } + + const auto array = record_batch->column(index); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto array_proxy, + arrow::matlab::array::proxy::wrap(array), + context, + error::UNKNOWN_PROXY_FOR_ARRAY_TYPE); + + + const auto array_proxy_id = ProxyManager::manageProxy(array_proxy); + const auto array_proxy_id_mda = factory.createScalar(array_proxy_id); + const auto array_type_id_mda = factory.createScalar(static_cast(array->type_id())); + + context.outputs[0] = array_proxy_id_mda; + context.outputs[1] = array_type_id_mda; + } } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h index 9561080d02276..b5d741060a15d 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h @@ -35,7 +35,8 @@ namespace arrow::matlab::tabular::proxy { void toString(libmexclass::proxy::method::Context& context); void numColumns(libmexclass::proxy::method::Context& context); void columnNames(libmexclass::proxy::method::Context& context); - + void getColumnByIndex(libmexclass::proxy::method::Context& context); + std::shared_ptr record_batch; }; diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h index f5e2d30f8f4ec..f93240757c4c1 100644 --- a/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h +++ b/matlab/src/cpp/arrow/matlab/type/proxy/wrap.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include "arrow/type.h" #include "arrow/result.h" diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index 9af09702e1cf5..796f2f0c7a645 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -18,10 +18,6 @@ %arrow.tabular.RecordBatch A tabular data structure representing % a set of arrow.array.Array objects with a fixed schema. 
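% After this change, column(idx) rebuilds each arrow.array.Array on demand from the underlying C++ proxy (via arrow.type.traits), rather than reading it from the cached ArrowArrays cell array that is removed below.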
- properties (Access=private) - ArrowArrays = {}; - end - properties (Dependent, SetAccess=private, GetAccess=public) NumColumns ColumnNames @@ -42,23 +38,35 @@ end function arrowArray = column(obj, idx) - arrowArray = obj.ArrowArrays{idx}; + if ~isempty(idx) && isscalar(idx) && isnumeric(idx) && idx >= 1 + args = struct(Index=int32(idx)); + [proxyID, typeID] = obj.Proxy.getColumnByIndex(args); + traits = arrow.type.traits.traits(arrow.type.ID(typeID)); + proxy = libmexclass.proxy.Proxy(Name=traits.ArrayProxyClassName, ID=proxyID); + arrowArray = traits.ArrayConstructor(proxy); + else + errid = "arrow:tabular:recordbatch:UnsupportedColumnIndexType"; + msg = "Index must be a positive scalar integer."; + error(errid, msg); + end end function obj = RecordBatch(T) - obj.ArrowArrays = arrow.tabular.RecordBatch.decompose(T); + arrowArrays = arrow.tabular.RecordBatch.decompose(T); columnNames = string(T.Properties.VariableNames); - arrayProxyIDs = arrow.tabular.RecordBatch.getArrowProxyIDs(obj.ArrowArrays); + arrayProxyIDs = arrow.tabular.RecordBatch.getArrowProxyIDs(arrowArrays); opts = struct("ArrayProxyIDs", arrayProxyIDs, ... "ColumnNames", columnNames); obj.Proxy = libmexclass.proxy.Proxy("Name", "arrow.tabular.proxy.RecordBatch", "ConstructorArguments", {opts}); end function T = table(obj) - matlabArrays = cell(1, numel(obj.ArrowArrays)); + numColumns = obj.NumColumns; + matlabArrays = cell(1, numColumns); - for ii = 1:numel(obj.ArrowArrays) - matlabArrays{ii} = toMATLAB(obj.ArrowArrays{ii}); + for ii = 1:numColumns + arrowArray = obj.column(ii); + matlabArrays{ii} = toMATLAB(arrowArray); end variableNames = matlab.lang.makeUniqueStrings(obj.ColumnNames); diff --git a/matlab/test/arrow/tabular/tRecordBatch.m b/matlab/test/arrow/tabular/tRecordBatch.m index 89175c43dad7a..78bfe7c090697 100644 --- a/matlab/test/arrow/tabular/tRecordBatch.m +++ b/matlab/test/arrow/tabular/tRecordBatch.m @@ -46,6 +46,8 @@ function SupportedTypes(tc) for ii = 1:arrowRecordBatch.NumColumns column = arrowRecordBatch.column(ii); tc.verifyEqual(column.toMATLAB(), TOriginal{:, ii}); + traits = arrow.type.traits.traits(string(class(TOriginal{:, ii}))); + tc.verifyInstanceOf(column, traits.ArrayClassName); end end @@ -99,6 +101,25 @@ function EmptyTable(tc) tc.verifyEqual(TOriginal, TConverted); end - end + function EmptyRecordBatchColumnIndexError(tc) + TOriginal = table(); + arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + fcn = @() arrowRecordBatch.column(1); + tc.verifyError(fcn, "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch"); + end + + function InvalidNumericIndexError(tc) + TOriginal = table(1, 2, 3); + arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + fcn = @() arrowRecordBatch.column(4); + tc.verifyError(fcn, "arrow:tabular:recordbatch:InvalidNumericColumnIndex"); + end + function UnsupportedColumnIndexType(tc) + TOriginal = table(1, 2, 3); + arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + fcn = @() arrowRecordBatch.column(datetime(2022, 1, 3)); + tc.verifyError(fcn, "arrow:tabular:recordbatch:UnsupportedColumnIndexType"); + end + end end diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index a316a27e55d2b..f4696cfad26ee 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -44,6 +44,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/c set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES 
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/boolean_array.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/string_array.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/wrap.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/tabular/proxy/schema.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit/pack.cc" From 19d0daf40c2db5164a6bfefc91019253d8db7157 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 4 Aug 2023 15:19:57 -0400 Subject: [PATCH 093/749] GH-36984: [MATLAB] Create `arrow.recordbatch` convenience constructor function (#37025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change For parity with how `arrow.array.Array` objects are constructed (i.e. via `arrow.array`), we should create a construction function called `arrow.recordbatch()` for `arrow.tabular.RecordBatch`: ```matlab >> t = table(["A"; "B"; "C"], [1; 2; 3]) t = 3×2 table Var1 Var2 ____ ____ "A" 1 "B" 2 "C" 3 >> rb = arrow.recordbatch(t) rb = Var1: [ "A", "B", "C" ] Var2: [ 1, 2, 3 ] >> class(rb) ans = 'arrow.tabular.RecordBatch' ``` The `arrow.tabular.RecordBatch` constructor will accept a scalar `libmexclass.proxy.Proxy` object instead of a MATLAB `table` (although, client code is not expected to call the constructor directly - similar to the other classes). ### What changes are included in this PR? 1. Added the convenience constructor function `arrow.recordbatch()`. It accepts a MATLAB `table` as input. 2. Modified `arrow.tabular.RecordBatch`'s constructor to expect a scalar `libmexclass.proxy.Proxy` object as input instead of a `table`. ### Are these changes tested? Yes, updated the test cases in `tRecordBatch.m` to use the new convenience constructor `arrow.recordbatch()` . ### Are there any user-facing changes? Yes, users are now encouraged to use `arrow.recordbatch()` when constructing a record batch from a MATLAB `table` instead of `arrow.tabular.RecordBatch`'s constructor. * Closes: #36984 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- .../+arrow/+tabular/+internal/decompose.m | 27 ++++++++++ .../+tabular/+internal/getArrayProxyIDs.m | 26 +++++++++ .../src/matlab/+arrow/+tabular/RecordBatch.m | 53 +++---------------- matlab/src/matlab/+arrow/recordbatch.m | 31 +++++++++++ matlab/test/arrow/tabular/tRecordBatch.m | 22 ++++---- 5 files changed, 102 insertions(+), 57 deletions(-) create mode 100644 matlab/src/matlab/+arrow/+tabular/+internal/decompose.m create mode 100644 matlab/src/matlab/+arrow/+tabular/+internal/getArrayProxyIDs.m create mode 100644 matlab/src/matlab/+arrow/recordbatch.m diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/decompose.m b/matlab/src/matlab/+arrow/+tabular/+internal/decompose.m new file mode 100644 index 0000000000000..2d8ec581304bd --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/decompose.m @@ -0,0 +1,27 @@ +%DECOMPOSE Decompose the input MATLAB table input a cell array of +% equivalent arrow.array.Array instances. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. 
+% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +function arrowArrays = decompose(T) + numColumns = width(T); + arrowArrays = cell(1, numColumns); + + % Convert each MATLAB array into a corresponding + % arrow.array.Array. + for ii = 1:numColumns + arrowArrays{ii} = arrow.array(T{:, ii}); + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/getArrayProxyIDs.m b/matlab/src/matlab/+arrow/+tabular/+internal/getArrayProxyIDs.m new file mode 100644 index 0000000000000..8a1f88158953c --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/getArrayProxyIDs.m @@ -0,0 +1,26 @@ +%GETARRAYPROXYIDS Extract the Proxy IDs underlying a cell array of +% arrow.array.Array instances. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +function proxyIDs = getArrayProxyIDs(arrowArrays) + proxyIDs = zeros(1, numel(arrowArrays), "uint64"); + + % Convert each MATLAB array into a corresponding + % arrow.array.Array. + for ii = 1:numel(arrowArrays) + proxyIDs(ii) = arrowArrays{ii}.Proxy.ID; + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index 796f2f0c7a645..0d002797f0121 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -28,6 +28,13 @@ end methods + function obj = RecordBatch(proxy) + arguments + proxy(1, 1) libmexclass.proxy.Proxy {validate(proxy, "arrow.tabular.proxy.RecordBatch")} + end + import arrow.internal.proxy.validate + obj.Proxy = proxy; + end function numColumns = get.NumColumns(obj) numColumns = obj.Proxy.numColumns(); @@ -51,15 +58,6 @@ end end - function obj = RecordBatch(T) - arrowArrays = arrow.tabular.RecordBatch.decompose(T); - columnNames = string(T.Properties.VariableNames); - arrayProxyIDs = arrow.tabular.RecordBatch.getArrowProxyIDs(arrowArrays); - opts = struct("ArrayProxyIDs", arrayProxyIDs, ... 
- "ColumnNames", columnNames); - obj.Proxy = libmexclass.proxy.Proxy("Name", "arrow.tabular.proxy.RecordBatch", "ConstructorArguments", {opts}); - end - function T = table(obj) numColumns = obj.NumColumns; matlabArrays = cell(1, numColumns); @@ -78,41 +76,6 @@ function T = toMATLAB(obj) T = obj.table(); end - - end - - methods (Static) - - function arrowArrays = decompose(T) - % Decompose the input MATLAB table - % input a cell array of equivalent arrow.array.Array - % instances. - arguments - T table - end - - numColumns = width(T); - arrowArrays = cell(1, numColumns); - - % Convert each MATLAB array into a corresponding - % arrow.array.Array. - for ii = 1:numColumns - arrowArrays{ii} = arrow.array(T{:, ii}); - end - end - - function proxyIDs = getArrowProxyIDs(arrowArrays) - % Extract the Proxy IDs underlying a cell array of - % arrow.array.Array instances. - proxyIDs = zeros(1, numel(arrowArrays), "uint64"); - - % Convert each MATLAB array into a corresponding - % arrow.array.Array. - for ii = 1:numel(arrowArrays) - proxyIDs(ii) = arrowArrays{ii}.Proxy.ID; - end - end - end methods (Access = private) @@ -126,6 +89,4 @@ function displayScalarObject(obj) disp(obj.toString()); end end - end - diff --git a/matlab/src/matlab/+arrow/recordbatch.m b/matlab/src/matlab/+arrow/recordbatch.m new file mode 100644 index 0000000000000..1fb25347f1a7d --- /dev/null +++ b/matlab/src/matlab/+arrow/recordbatch.m @@ -0,0 +1,31 @@ +%RECORDBATCH Creates an arrow.tabular.RecordBatch from a table. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +function rb = recordbatch(T) + arguments + T table + end + + arrowArrays = arrow.tabular.internal.decompose(T); + arrayProxyIDs = arrow.tabular.internal.getArrayProxyIDs(arrowArrays); + + columnNames = string(T.Properties.VariableNames); + args = struct(ArrayProxyIDs=arrayProxyIDs, ColumnNames=columnNames); + proxyName = "arrow.tabular.proxy.RecordBatch"; + proxy = arrow.internal.proxy.create(proxyName, args); + + rb = arrow.tabular.RecordBatch(proxy); +end diff --git a/matlab/test/arrow/tabular/tRecordBatch.m b/matlab/test/arrow/tabular/tRecordBatch.m index 78bfe7c090697..025f15320ade4 100644 --- a/matlab/test/arrow/tabular/tRecordBatch.m +++ b/matlab/test/arrow/tabular/tRecordBatch.m @@ -20,7 +20,7 @@ function Basic(tc) T = table([1, 2, 3]'); - arrowRecordBatch = arrow.tabular.RecordBatch(T); + arrowRecordBatch = arrow.recordbatch(T); className = string(class(arrowRecordBatch)); tc.verifyEqual(className, "arrow.tabular.RecordBatch"); end @@ -40,7 +40,7 @@ function SupportedTypes(tc) double ([1, 2, 3]'), ... string (["A", "B", "C"]'), ... 
datetime(2023, 6, 28) + days(0:2)'); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); TConverted = arrowRecordBatch.toMATLAB(); tc.verifyEqual(TOriginal, TConverted); for ii = 1:arrowRecordBatch.NumColumns @@ -53,14 +53,14 @@ function SupportedTypes(tc) function ToMATLAB(tc) TOriginal = table([1, 2, 3]'); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); TConverted = arrowRecordBatch.toMATLAB(); tc.verifyEqual(TOriginal, TConverted); end function Table(tc) TOriginal = table([1, 2, 3]'); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); TConverted = table(arrowRecordBatch); tc.verifyEqual(TOriginal, TConverted); end @@ -68,7 +68,7 @@ function Table(tc) function ColumnNames(tc) columnNames = ["A", "B", "C"]; TOriginal = table(1, 2, 3, VariableNames=columnNames); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); tc.verifyEqual(arrowRecordBatch.ColumnNames, columnNames); end @@ -77,7 +77,7 @@ function NumColumns(tc) for nc = numColumns T = array2table(ones(1, nc)); - arrowRecordBatch = arrow.tabular.RecordBatch(T); + arrowRecordBatch = arrow.recordbatch(T); tc.verifyEqual(arrowRecordBatch.NumColumns, nc); end end @@ -88,7 +88,7 @@ function UnicodeColumnNames(tc) mango = "🥭"; columnNames = [smiley, tree, mango]; TOriginal = table(1, 2, 3, VariableNames=columnNames); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); tc.verifyEqual(arrowRecordBatch.ColumnNames, columnNames); TConverted = arrowRecordBatch.toMATLAB(); tc.verifyEqual(TOriginal, TConverted); @@ -96,28 +96,28 @@ function UnicodeColumnNames(tc) function EmptyTable(tc) TOriginal = table(); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); TConverted = arrowRecordBatch.toMATLAB(); tc.verifyEqual(TOriginal, TConverted); end function EmptyRecordBatchColumnIndexError(tc) TOriginal = table(); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); fcn = @() arrowRecordBatch.column(1); tc.verifyError(fcn, "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch"); end function InvalidNumericIndexError(tc) TOriginal = table(1, 2, 3); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); fcn = @() arrowRecordBatch.column(4); tc.verifyError(fcn, "arrow:tabular:recordbatch:InvalidNumericColumnIndex"); end function UnsupportedColumnIndexType(tc) TOriginal = table(1, 2, 3); - arrowRecordBatch = arrow.tabular.RecordBatch(TOriginal); + arrowRecordBatch = arrow.recordbatch(TOriginal); fcn = @() arrowRecordBatch.column(datetime(2022, 1, 3)); tc.verifyError(fcn, "arrow:tabular:recordbatch:UnsupportedColumnIndexType"); end From 839ea601d5241dde43ee59f667eaceec963977b5 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 7 Aug 2023 08:41:41 +0800 Subject: [PATCH 094/749] MINOR: [C++] async_until typo fix (#37029) ### Rationale for this change Fix typo in `cpp/src/arrow/acero/source_node.cc` and `cpp/src/arrow/util/async_util.h`. ### What changes are included in this PR? Fix typo in `cpp/src/arrow/acero/source_node.cc` and `cpp/src/arrow/util/async_util.h`. ### Are these changes tested? no ### Are there any user-facing changes? 
no Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/src/arrow/acero/source_node.cc | 4 ++-- cpp/src/arrow/util/async_util.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/acero/source_node.cc b/cpp/src/arrow/acero/source_node.cc index ddb062ac005ff..8060e01f074f8 100644 --- a/cpp/src/arrow/acero/source_node.cc +++ b/cpp/src/arrow/acero/source_node.cc @@ -99,7 +99,7 @@ struct SourceNode : ExecNode, public TracedNode { : ExecNode(plan, {}, {}, std::move(output_schema)), TracedNode(this), generator_(std::move(generator)), - ordering_(ordering) {} + ordering_(std::move(ordering)) {} static Result Make(ExecPlan* plan, std::vector inputs, const ExecNodeOptions& options) { @@ -177,7 +177,7 @@ struct SourceNode : ExecNode, public TracedNode { CallbackOptions options; // These options will transfer execution to the desired Executor if necessary. - // This can happen for in-memory scans where batches didn't require + // This can happen for in-memory scans where batches don't require // any CPU work to decode. Otherwise, parsing etc should have already // been placed us on the desired Executor and no queues will be pushed to. options.executor = plan()->query_context()->executor(); diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index 2668ae222609b..db03719f5e84c 100644 --- a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -58,7 +58,7 @@ namespace util { /// finish. Note, it is not an error to add additional tasks after a scheduler has /// aborted. These tasks will be ignored and never submitted. The scheduler returns a /// future which will complete when all submitted tasks have finished executing. Once all -/// tasks have been finsihed the scheduler is invalid and should no longer be used. +/// tasks have been finished the scheduler is invalid and should no longer be used. /// /// Task failure (either the synchronous portion or the asynchronous portion) will cause /// the scheduler to enter an aborted state. The first such failure will be reported in @@ -117,7 +117,7 @@ class ARROW_EXPORT AsyncTaskScheduler { /// /// A task's name must remain valid for the duration of the task. It is used for /// debugging (e.g. when debugging a deadlock to see which tasks still remain) and for - /// traceability (the name will be used for spans asigned to the task) + /// traceability (the name will be used for spans assigned to the task) /// /// \return true if the task was submitted or queued, false if the task was ignored virtual bool AddTask(std::unique_ptr task) = 0; @@ -262,7 +262,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { virtual void Pause() = 0; /// Resume the throttle /// - /// Allows taks to be submitted again. If there is a max_concurrent_cost limit then + /// Allows task to be submitted again. If there is a max_concurrent_cost limit then /// it will still apply. virtual void Resume() = 0; }; @@ -274,7 +274,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { virtual void Pause() = 0; /// Resume the throttle /// - /// Allows taks to be submitted again. If there is a max_concurrent_cost limit then + /// Allows task to be submitted again. If there is a max_concurrent_cost limit then /// it will still apply. 
virtual void Resume() = 0; From 311e8668be5edd029df292a7f0a033bc40db9b8d Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Mon, 7 Aug 2023 09:42:53 +0900 Subject: [PATCH 095/749] MINOR: [Docs] Fix a typo in env_vars.rst (#37030) ### Rationale for this change ### What changes are included in this PR? accross -> across ### Are these changes tested? ### Are there any user-facing changes? Authored-by: Ikko Eltociear Ashimine Signed-off-by: Sutou Kouhei --- docs/source/cpp/env_vars.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 8d10fd2cc2e40..b4d93c7eadd9b 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -145,7 +145,7 @@ that changing their value later will have an effect. .. envvar:: GANDIVA_CACHE_SIZE The number of entries to keep in the Gandiva JIT compilation cache. - The cache is in-memory and does not persist accross processes. + The cache is in-memory and does not persist across processes. .. envvar:: HADOOP_HOME From 3bb13da316e4ddd4a7aab26a0f099763f1d32b1f Mon Sep 17 00:00:00 2001 From: SGZW Date: Mon, 7 Aug 2023 08:45:53 +0800 Subject: [PATCH 096/749] MINOR: [C++] Fix a typo in Acero agg description (#37031) ### Rationale for this change ### What changes are included in this PR? fix acero agg node typo ### Are these changes tested? ### Are there any user-facing changes? Authored-by: SGZW Signed-off-by: Sutou Kouhei --- cpp/src/arrow/acero/aggregate_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/aggregate_internal.h b/cpp/src/arrow/acero/aggregate_internal.h index 01861f0242f34..744acb124505a 100644 --- a/cpp/src/arrow/acero/aggregate_internal.h +++ b/cpp/src/arrow/acero/aggregate_internal.h @@ -52,7 +52,7 @@ // segment-keys is used to refine the partitioning. However, segment-keys are different in // that they partition only consecutive rows into a single group. Such a partition of // consecutive rows is called a segment group. For example, consider a column X with -// values [A, A, B, A] at row-indices [0, 1, 2]. A regular group-by aggregation with keys +// values [A, A, B, A] at row-indices [0, 1, 2, 3]. A regular group-by aggregation with keys // [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by // aggregation with segment-keys [X] yields [[0, 1], [2], [3]]. // From 2b36521e52f61f6a68e58e7c8c2f7bf2ed805cdc Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Sun, 6 Aug 2023 23:38:14 -0400 Subject: [PATCH 097/749] GH-36642: [Python][CI] Configure warnings as errors during pytest (#37018) ### Rationale for this change Warnings are constantly being introduced into the pyarrow tests. Let's try enforcing them as errors in an effort to keep the codebase healthy. ### What changes are included in this PR? * Fixed existing warnings * Set warnings as errors in CI ### Are these changes tested? Yes, ran pytests locally w/o warnings. ### Are there any user-facing changes? 
No * Closes: #36642 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 3 ++- docker-compose.yml | 1 + python/pyarrow/tests/parquet/conftest.py | 15 +++++++------ python/pyarrow/tests/parquet/test_dataset.py | 2 +- python/pyarrow/tests/strategies.py | 22 +++++++++++--------- python/pyarrow/tests/test_pandas.py | 5 ++++- python/pyarrow/tests/test_tensor.py | 7 +++++-- 7 files changed, 34 insertions(+), 21 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 73b793162d959..941506b9c2abc 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1253,6 +1253,7 @@ tasks: params: env: PYTHON: "{{ python_version }}" + PYTEST_ARGS: "-W error" image: conda-python {% endfor %} @@ -1265,7 +1266,7 @@ tasks: HYPOTHESIS_PROFILE: ci PYARROW_TEST_HYPOTHESIS: ON # limit to execute hypothesis tests only - PYTEST_ARGS: "-m hypothesis" + PYTEST_ARGS: "-m hypothesis -W error" image: conda-python-pandas test-conda-python-3.10-substrait: diff --git a/docker-compose.yml b/docker-compose.yml index fe98a30d0b92b..3bf346ef94173 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -835,6 +835,7 @@ services: shm_size: *shm-size environment: <<: [*common, *ccache, *sccache] + PYTEST_ARGS: # inherit volumes: *conda-volumes command: &python-conda-command [" diff --git a/python/pyarrow/tests/parquet/conftest.py b/python/pyarrow/tests/parquet/conftest.py index 1e75493cdae03..461c24af22aa9 100644 --- a/python/pyarrow/tests/parquet/conftest.py +++ b/python/pyarrow/tests/parquet/conftest.py @@ -29,9 +29,10 @@ def datadir(base_datadir): def s3_bucket(s3_server): boto3 = pytest.importorskip('boto3') botocore = pytest.importorskip('botocore') + s3_bucket_name = 'test-s3fs' host, port, access_key, secret_key = s3_server['connection'] - s3 = boto3.resource( + s3_client = boto3.client( 's3', endpoint_url='http://{}:{}'.format(host, port), aws_access_key_id=access_key, @@ -39,13 +40,15 @@ def s3_bucket(s3_server): config=botocore.client.Config(signature_version='s3v4'), region_name='us-east-1' ) - bucket = s3.Bucket('test-s3fs') + try: - bucket.create() + s3_client.create_bucket(Bucket=s3_bucket_name) except Exception: - # we get BucketAlreadyOwnedByYou error with fsspec handler - pass - return 'test-s3fs' + pass # we get BucketAlreadyOwnedByYou error with fsspec handler + finally: + s3_client.close() + + return s3_bucket_name @pytest.fixture diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index cd991617c9fa8..3e6ff49265c32 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -1316,7 +1316,7 @@ def _test_write_to_dataset_with_partitions(base_path, output_df[col] = output_df[col].astype('category') if schema: - expected_date_type = schema.field_by_name('date').type.to_pandas_dtype() + expected_date_type = schema.field('date').type.to_pandas_dtype() output_df["date"] = output_df["date"].astype(expected_date_type) tm.assert_frame_equal(output_df, input_df) diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 48f7e5381724a..bb88a4dcb7b2a 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -182,15 +182,17 @@ def struct_types(draw, item_strategy=primitive_types): def dictionary_types(key_strategy=None, value_strategy=None): - key_strategy = key_strategy or signed_integer_types - value_strategy = value_strategy or st.one_of( - bool_type, - integer_types, - st.sampled_from([pa.float32(), 
pa.float64()]), - binary_type, - string_type, - fixed_size_binary_type, - ) + if key_strategy is None: + key_strategy = signed_integer_types + if value_strategy is None: + value_strategy = st.one_of( + bool_type, + integer_types, + st.sampled_from([pa.float32(), pa.float64()]), + binary_type, + string_type, + fixed_size_binary_type, + ) return st.builds(pa.dictionary, key_strategy, value_strategy) @@ -368,7 +370,7 @@ def record_batches(draw, type, rows=None, max_fields=None): children = [draw(arrays(field.type, size=rows)) for field in schema] # TODO(kszucs): the names and schema arguments are not consistent with # Table.from_array's arguments - return pa.RecordBatch.from_arrays(children, names=schema) + return pa.RecordBatch.from_arrays(children, schema=schema) @st.composite diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 8bdc7253a4837..ef6ddd09933c9 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2913,7 +2913,10 @@ def test_strided_data_import(self): 'f4', 'f8'] for type_name in numeric_dtypes: - cases.append(random_numbers.astype(type_name)) + # Casting np.float64 -> uint32 or uint64 throws a RuntimeWarning + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + cases.append(random_numbers.astype(type_name)) # strings cases.append(np.array([random_ascii(10) for i in range(N * K)], diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py index aee46bc93690c..3e6a4ca8ed222 100644 --- a/python/pyarrow/tests/test_tensor.py +++ b/python/pyarrow/tests/test_tensor.py @@ -18,6 +18,7 @@ import os import sys import pytest +import warnings import weakref import numpy as np @@ -82,8 +83,10 @@ def test_tensor_base_object(): @pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_tensor_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) - data = (100 * np.random.randn(10, 4)).astype(dtype) - + # Casting np.float64 -> uint32 or uint64 throws a RuntimeWarning + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + data = (100 * np.random.randn(10, 4)).astype(dtype) tensor = pa.Tensor.from_numpy(data) assert tensor.type == arrow_type From 7cbbd3ee95bc0f7d20bb358e2978e2eb18a05304 Mon Sep 17 00:00:00 2001 From: Anja Kefala Date: Sun, 6 Aug 2023 20:55:25 -0700 Subject: [PATCH 098/749] MINOR: [Docs] add note about how comment bot assigns users (#36362) ### Rationale for this change Each time I wanted to use it, I had to look up the syntax in the github workflows. Most new users wouldn't know about it existing. https://github.com/apache/arrow/blob/e5de6a59f410a3255cc84138b44fc5802b627afc/.github/workflows/comment_bot.yml#L182 ### What changes are included in this PR? Doc addition of text teaching about "take". ### Are there any user-facing changes? Nope Authored-by: anjakefala Signed-off-by: Sutou Kouhei --- docs/source/developers/guide/step_by_step/finding_issues.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/developers/guide/step_by_step/finding_issues.rst b/docs/source/developers/guide/step_by_step/finding_issues.rst index 89c5a81b73dde..a3af1640a3b1d 100644 --- a/docs/source/developers/guide/step_by_step/finding_issues.rst +++ b/docs/source/developers/guide/step_by_step/finding_issues.rst @@ -69,6 +69,7 @@ in the comments. When you find a GitHub issue you would like to work on, please mention your interest in the comment section of that issue; that way we will know you are working on it. 
+ Consider assigning yourself to the issue (:ref:`issue-assignment`) when the work starts. Also, do not hesitate to ask questions in the comment. You can get some pointers about where to start and similar issues already solved. From 3c00b08046be6549cc05714959ddcb993051ac45 Mon Sep 17 00:00:00 2001 From: Kuba Martin Date: Mon, 7 Aug 2023 18:27:26 +0200 Subject: [PATCH 099/749] GH-36936: [Go] Make it possible to register custom functions. (#36959) ### Rationale for this change As discussed in the issue #36936 , this change makes it possible for the consumers of the module to register custom functions, while taking advantage of the existing infrastructure for matching types and choosing kernels. ### What changes are included in this PR? Just moving the package one level higher, so that it's module-exported. ### Are these changes tested? There's no actual code changes other than moving files around and updating imports - existing tests still pass. ### Are there any user-facing changes? The `compute/exec` is now exported. * Closes: #36936 Authored-by: Jakub Martin Signed-off-by: Matt Topol --- go/arrow/compute/arithmetic.go | 2 +- go/arrow/compute/arithmetic_test.go | 2 +- go/arrow/compute/cast.go | 2 +- go/arrow/compute/example_test.go | 91 +++++++++++++++++++ go/arrow/compute/exec.go | 2 +- .../compute/{internal => }/exec/hash_util.go | 0 .../compute/{internal => }/exec/kernel.go | 0 .../{internal => }/exec/kernel_test.go | 2 +- go/arrow/compute/{internal => }/exec/span.go | 0 .../compute/{internal => }/exec/span_test.go | 2 +- go/arrow/compute/{internal => }/exec/utils.go | 0 .../compute/{internal => }/exec/utils_test.go | 2 +- go/arrow/compute/exec_internals_test.go | 2 +- go/arrow/compute/exec_test.go | 2 +- go/arrow/compute/executor.go | 2 +- go/arrow/compute/expression.go | 2 +- go/arrow/compute/exprs/exec.go | 2 +- go/arrow/compute/functions.go | 2 +- .../internal/kernels/base_arithmetic.go | 2 +- .../internal/kernels/base_arithmetic_amd64.go | 2 +- .../kernels/basic_arithmetic_noasm.go | 2 +- .../compute/internal/kernels/boolean_cast.go | 2 +- go/arrow/compute/internal/kernels/cast.go | 2 +- .../compute/internal/kernels/cast_temporal.go | 2 +- go/arrow/compute/internal/kernels/helpers.go | 2 +- .../compute/internal/kernels/numeric_cast.go | 2 +- go/arrow/compute/internal/kernels/rounding.go | 2 +- .../internal/kernels/scalar_arithmetic.go | 2 +- .../internal/kernels/scalar_boolean.go | 2 +- .../kernels/scalar_comparison_amd64.go | 2 +- .../kernels/scalar_comparison_noasm.go | 2 +- .../internal/kernels/scalar_comparisons.go | 2 +- .../compute/internal/kernels/string_casts.go | 2 +- go/arrow/compute/internal/kernels/types.go | 2 +- .../compute/internal/kernels/vector_hash.go | 2 +- .../internal/kernels/vector_run_end_encode.go | 2 +- .../internal/kernels/vector_selection.go | 2 +- go/arrow/compute/registry_test.go | 2 +- go/arrow/compute/scalar_bool.go | 2 +- go/arrow/compute/scalar_compare.go | 2 +- go/arrow/compute/scalar_compare_test.go | 2 +- go/arrow/compute/selection.go | 2 +- go/arrow/compute/utils.go | 2 +- go/arrow/compute/vector_hash_test.go | 2 +- go/arrow/compute/vector_run_end_test.go | 2 +- go/arrow/compute/vector_selection_test.go | 2 +- 46 files changed, 132 insertions(+), 41 deletions(-) create mode 100644 go/arrow/compute/example_test.go rename go/arrow/compute/{internal => }/exec/hash_util.go (100%) rename go/arrow/compute/{internal => }/exec/kernel.go (100%) rename go/arrow/compute/{internal => }/exec/kernel_test.go (99%) rename go/arrow/compute/{internal => 
}/exec/span.go (100%) rename go/arrow/compute/{internal => }/exec/span_test.go (99%) rename go/arrow/compute/{internal => }/exec/utils.go (100%) rename go/arrow/compute/{internal => }/exec/utils_test.go (98%) diff --git a/go/arrow/compute/arithmetic.go b/go/arrow/compute/arithmetic.go index 2df547e5b4fa4..046cb3f2d9e13 100644 --- a/go/arrow/compute/arithmetic.go +++ b/go/arrow/compute/arithmetic.go @@ -23,7 +23,7 @@ import ( "fmt" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go index c4e0c591cbf54..d9dd799b7cfce 100644 --- a/go/arrow/compute/arithmetic_test.go +++ b/go/arrow/compute/arithmetic_test.go @@ -29,7 +29,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" diff --git a/go/arrow/compute/cast.go b/go/arrow/compute/cast.go index aad07413bf3ff..d77273f539595 100644 --- a/go/arrow/compute/cast.go +++ b/go/arrow/compute/cast.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" ) diff --git a/go/arrow/compute/example_test.go b/go/arrow/compute/example_test.go new file mode 100644 index 0000000000000..8d0f25e4c4372 --- /dev/null +++ b/go/arrow/compute/example_test.go @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package compute_test + +import ( + "context" + "fmt" + "log" + + "github.com/apache/arrow/go/v13/arrow" + "github.com/apache/arrow/go/v13/arrow/array" + "github.com/apache/arrow/go/v13/arrow/compute" + "github.com/apache/arrow/go/v13/arrow/compute/exec" + "github.com/apache/arrow/go/v13/arrow/memory" +) + +// This example demonstrates how to register a custom scalar function. 
+func Example_customFunction() { + pool := memory.NewGoAllocator() + + ctx := context.Background() + execCtx := compute.DefaultExecCtx() + ctx = compute.SetExecCtx(ctx, execCtx) + + add42 := compute.NewScalarFunction("add_42", compute.Arity{ + NArgs: 1, + }, compute.FunctionDoc{ + Summary: "Returns the input values plus 42", + ArgNames: []string{"input"}, + }) + + if err := add42.AddNewKernel( + []exec.InputType{ + // We accept a single argument (array) of Int8 type. + { + Kind: exec.InputExact, + Type: arrow.PrimitiveTypes.Int8, + }, + }, + // We'll return a single Int8 array. + exec.NewOutputType(arrow.PrimitiveTypes.Int8), + func(ctx *exec.KernelCtx, span *exec.ExecSpan, result *exec.ExecResult) error { + // The second buffer contains the values. Both for the input and the output arrays. + for i, x := range span.Values[0].Array.Buffers[1].Buf { + result.Buffers[1].Buf[i] = x + 42 + } + return nil + }, + nil, + ); err != nil { + log.Fatal(err) + } + execCtx.Registry.AddFunction(add42, true) + + inputArrayBuilder := array.NewInt8Builder(pool) + for i := 0; i < 16; i++ { + inputArrayBuilder.Append(int8(i)) + } + inputArray := inputArrayBuilder.NewArray() + + outputArrayDatum, err := compute.CallFunction( + compute.SetExecCtx(context.Background(), execCtx), + "add_42", + nil, + &compute.ArrayDatum{Value: inputArray.Data()}, + ) + if err != nil { + log.Fatal(err) + } + + fmt.Println(array.NewInt8Data(outputArrayDatum.(*compute.ArrayDatum).Value).Int8Values()) + + // Output: + // [42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57] +} diff --git a/go/arrow/compute/exec.go b/go/arrow/compute/exec.go index 6dbef8cdfbbd9..c478a1dcdef7d 100644 --- a/go/arrow/compute/exec.go +++ b/go/arrow/compute/exec.go @@ -23,7 +23,7 @@ import ( "fmt" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" ) diff --git a/go/arrow/compute/internal/exec/hash_util.go b/go/arrow/compute/exec/hash_util.go similarity index 100% rename from go/arrow/compute/internal/exec/hash_util.go rename to go/arrow/compute/exec/hash_util.go diff --git a/go/arrow/compute/internal/exec/kernel.go b/go/arrow/compute/exec/kernel.go similarity index 100% rename from go/arrow/compute/internal/exec/kernel.go rename to go/arrow/compute/exec/kernel.go diff --git a/go/arrow/compute/internal/exec/kernel_test.go b/go/arrow/compute/exec/kernel_test.go similarity index 99% rename from go/arrow/compute/internal/exec/kernel_test.go rename to go/arrow/compute/exec/kernel_test.go index 827ac97e63a01..65fc41e4e4d56 100644 --- a/go/arrow/compute/internal/exec/kernel_test.go +++ b/go/arrow/compute/exec/kernel_test.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/apache/arrow/go/v13/arrow/scalar" "github.com/stretchr/testify/assert" diff --git a/go/arrow/compute/internal/exec/span.go b/go/arrow/compute/exec/span.go similarity index 100% rename from go/arrow/compute/internal/exec/span.go rename to go/arrow/compute/exec/span.go diff --git a/go/arrow/compute/internal/exec/span_test.go b/go/arrow/compute/exec/span_test.go similarity index 99% rename from go/arrow/compute/internal/exec/span_test.go rename to 
go/arrow/compute/exec/span_test.go index 2423824bbe5a7..3cbd54f0d4094 100644 --- a/go/arrow/compute/internal/exec/span_test.go +++ b/go/arrow/compute/exec/span_test.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/endian" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/compute/internal/exec/utils.go b/go/arrow/compute/exec/utils.go similarity index 100% rename from go/arrow/compute/internal/exec/utils.go rename to go/arrow/compute/exec/utils.go diff --git a/go/arrow/compute/internal/exec/utils_test.go b/go/arrow/compute/exec/utils_test.go similarity index 98% rename from go/arrow/compute/internal/exec/utils_test.go rename to go/arrow/compute/exec/utils_test.go index 40b98b26e7d1e..4f908bb2057b1 100644 --- a/go/arrow/compute/internal/exec/utils_test.go +++ b/go/arrow/compute/exec/utils_test.go @@ -23,7 +23,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/stretchr/testify/assert" ) diff --git a/go/arrow/compute/exec_internals_test.go b/go/arrow/compute/exec_internals_test.go index 4d259aa27394a..3247becd969a3 100644 --- a/go/arrow/compute/exec_internals_test.go +++ b/go/arrow/compute/exec_internals_test.go @@ -27,7 +27,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/testing/gen" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/apache/arrow/go/v13/arrow/scalar" diff --git a/go/arrow/compute/exec_test.go b/go/arrow/compute/exec_test.go index fb97c3c803f6b..308e30aac7316 100644 --- a/go/arrow/compute/exec_test.go +++ b/go/arrow/compute/exec_test.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/scalar" "github.com/stretchr/testify/suite" diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go index d3f1a1fd41d4c..962f41019dac2 100644 --- a/go/arrow/compute/executor.go +++ b/go/arrow/compute/executor.go @@ -28,7 +28,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/compute/expression.go b/go/arrow/compute/expression.go index b01c3b67133ad..04128bef7738c 100644 --- a/go/arrow/compute/expression.go +++ b/go/arrow/compute/expression.go @@ -30,7 +30,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" 
"github.com/apache/arrow/go/v13/arrow/array" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/ipc" diff --git a/go/arrow/compute/exprs/exec.go b/go/arrow/compute/exprs/exec.go index 97b16ede11464..7683587478bf2 100644 --- a/go/arrow/compute/exprs/exec.go +++ b/go/arrow/compute/exprs/exec.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/endian" "github.com/apache/arrow/go/v13/arrow/internal/debug" diff --git a/go/arrow/compute/functions.go b/go/arrow/compute/functions.go index 3943fbca3597f..887346f7f5d06 100644 --- a/go/arrow/compute/functions.go +++ b/go/arrow/compute/functions.go @@ -24,7 +24,7 @@ import ( "strings" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" ) type Function interface { diff --git a/go/arrow/compute/internal/kernels/base_arithmetic.go b/go/arrow/compute/internal/kernels/base_arithmetic.go index 5da1b29a63dc8..bfdb7b8f2103e 100644 --- a/go/arrow/compute/internal/kernels/base_arithmetic.go +++ b/go/arrow/compute/internal/kernels/base_arithmetic.go @@ -25,7 +25,7 @@ import ( "github.com/JohnCGriffin/overflow" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/internal/debug" diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go b/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go index 432acc3d81f8b..9683ac9420fdd 100644 --- a/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go +++ b/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go @@ -21,7 +21,7 @@ package kernels import ( "unsafe" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" "golang.org/x/exp/constraints" "golang.org/x/sys/cpu" diff --git a/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go b/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go index 1a2874df704f2..767425d61e221 100644 --- a/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go +++ b/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go @@ -19,7 +19,7 @@ package kernels import ( - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "golang.org/x/exp/constraints" ) diff --git a/go/arrow/compute/internal/kernels/boolean_cast.go b/go/arrow/compute/internal/kernels/boolean_cast.go index 7e1ff50c816df..b0ca7c2d9157a 100644 --- a/go/arrow/compute/internal/kernels/boolean_cast.go +++ b/go/arrow/compute/internal/kernels/boolean_cast.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + 
"github.com/apache/arrow/go/v13/arrow/compute/exec" ) func isNonZero[T exec.FixedWidthTypes](ctx *exec.KernelCtx, in []T, out []byte) error { diff --git a/go/arrow/compute/internal/kernels/cast.go b/go/arrow/compute/internal/kernels/cast.go index 80be6ca15cc25..50cf775d2985e 100644 --- a/go/arrow/compute/internal/kernels/cast.go +++ b/go/arrow/compute/internal/kernels/cast.go @@ -21,7 +21,7 @@ package kernels import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" ) type CastOptions struct { diff --git a/go/arrow/compute/internal/kernels/cast_temporal.go b/go/arrow/compute/internal/kernels/cast_temporal.go index dbc92d2a3df0e..8201119a4edbc 100644 --- a/go/arrow/compute/internal/kernels/cast_temporal.go +++ b/go/arrow/compute/internal/kernels/cast_temporal.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" ) diff --git a/go/arrow/compute/internal/kernels/helpers.go b/go/arrow/compute/internal/kernels/helpers.go index fe91676f3bbc0..99816bc9bd43a 100644 --- a/go/arrow/compute/internal/kernels/helpers.go +++ b/go/arrow/compute/internal/kernels/helpers.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/apache/arrow/go/v13/arrow/scalar" diff --git a/go/arrow/compute/internal/kernels/numeric_cast.go b/go/arrow/compute/internal/kernels/numeric_cast.go index a7258f235418e..f0e469f15f338 100644 --- a/go/arrow/compute/internal/kernels/numeric_cast.go +++ b/go/arrow/compute/internal/kernels/numeric_cast.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/internal/debug" diff --git a/go/arrow/compute/internal/kernels/rounding.go b/go/arrow/compute/internal/kernels/rounding.go index ad385e4dce8e6..93f5829018f67 100644 --- a/go/arrow/compute/internal/kernels/rounding.go +++ b/go/arrow/compute/internal/kernels/rounding.go @@ -23,7 +23,7 @@ import ( "math" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/scalar" diff --git a/go/arrow/compute/internal/kernels/scalar_arithmetic.go b/go/arrow/compute/internal/kernels/scalar_arithmetic.go index 01622e0a4df94..3f6832cbbc583 100644 --- a/go/arrow/compute/internal/kernels/scalar_arithmetic.go +++ b/go/arrow/compute/internal/kernels/scalar_arithmetic.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + 
"github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/internal/debug" diff --git a/go/arrow/compute/internal/kernels/scalar_boolean.go b/go/arrow/compute/internal/kernels/scalar_boolean.go index 3c4916a1d94fe..59ea7627a204f 100644 --- a/go/arrow/compute/internal/kernels/scalar_boolean.go +++ b/go/arrow/compute/internal/kernels/scalar_boolean.go @@ -20,7 +20,7 @@ package kernels import ( "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/scalar" ) diff --git a/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go b/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go index 7d1db042cadae..a4cecb80bbd2f 100644 --- a/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go +++ b/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go @@ -22,7 +22,7 @@ import ( "unsafe" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "golang.org/x/sys/cpu" ) diff --git a/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go b/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go index e877610b372cf..204eaa6d448ea 100644 --- a/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go +++ b/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go @@ -18,7 +18,7 @@ package kernels -import "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" +import "github.com/apache/arrow/go/v13/arrow/compute/exec" func genCompareKernel[T exec.NumericTypes](op CompareOperator) *CompareData { return genGoCompareKernel(getCmpOp[T](op)) diff --git a/go/arrow/compute/internal/kernels/scalar_comparisons.go b/go/arrow/compute/internal/kernels/scalar_comparisons.go index 5e905b514c9e0..8d4ae244397d3 100644 --- a/go/arrow/compute/internal/kernels/scalar_comparisons.go +++ b/go/arrow/compute/internal/kernels/scalar_comparisons.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/internal/debug" diff --git a/go/arrow/compute/internal/kernels/string_casts.go b/go/arrow/compute/internal/kernels/string_casts.go index 344b10364c138..db278fd853d1d 100644 --- a/go/arrow/compute/internal/kernels/string_casts.go +++ b/go/arrow/compute/internal/kernels/string_casts.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/float16" "github.com/apache/arrow/go/v13/internal/bitutils" ) diff --git a/go/arrow/compute/internal/kernels/types.go b/go/arrow/compute/internal/kernels/types.go index 7743b076397c9..b1d0fa0403229 100644 --- a/go/arrow/compute/internal/kernels/types.go +++ b/go/arrow/compute/internal/kernels/types.go @@ -22,7 +22,7 @@ import ( "fmt" "github.com/apache/arrow/go/v13/arrow" - 
"github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/scalar" ) diff --git a/go/arrow/compute/internal/kernels/vector_hash.go b/go/arrow/compute/internal/kernels/vector_hash.go index abae8e1b0b287..ee592b98b87a5 100644 --- a/go/arrow/compute/internal/kernels/vector_hash.go +++ b/go/arrow/compute/internal/kernels/vector_hash.go @@ -23,7 +23,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/apache/arrow/go/v13/internal/bitutils" diff --git a/go/arrow/compute/internal/kernels/vector_run_end_encode.go b/go/arrow/compute/internal/kernels/vector_run_end_encode.go index 3c5b673f657a8..e5ea93a3ebc47 100644 --- a/go/arrow/compute/internal/kernels/vector_run_end_encode.go +++ b/go/arrow/compute/internal/kernels/vector_run_end_encode.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/float16" diff --git a/go/arrow/compute/internal/kernels/vector_selection.go b/go/arrow/compute/internal/kernels/vector_selection.go index 8edefe2453997..73b54fdeb5420 100644 --- a/go/arrow/compute/internal/kernels/vector_selection.go +++ b/go/arrow/compute/internal/kernels/vector_selection.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/apache/arrow/go/v13/internal/bitutils" diff --git a/go/arrow/compute/registry_test.go b/go/arrow/compute/registry_test.go index e06bd47a7cab3..c447364359287 100644 --- a/go/arrow/compute/registry_test.go +++ b/go/arrow/compute/registry_test.go @@ -25,7 +25,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/stretchr/testify/assert" "golang.org/x/exp/slices" ) diff --git a/go/arrow/compute/scalar_bool.go b/go/arrow/compute/scalar_bool.go index e47830ebc96b7..5678a1fb6943e 100644 --- a/go/arrow/compute/scalar_bool.go +++ b/go/arrow/compute/scalar_bool.go @@ -22,7 +22,7 @@ import ( "fmt" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" ) diff --git a/go/arrow/compute/scalar_compare.go b/go/arrow/compute/scalar_compare.go index dc4d807e3d671..14c0a4d25d0da 100644 --- a/go/arrow/compute/scalar_compare.go +++ b/go/arrow/compute/scalar_compare.go @@ -22,7 +22,7 @@ import ( "context" "github.com/apache/arrow/go/v13/arrow" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" 
+ "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" ) diff --git a/go/arrow/compute/scalar_compare_test.go b/go/arrow/compute/scalar_compare_test.go index 460f856e28d47..4643e26ce1873 100644 --- a/go/arrow/compute/scalar_compare_test.go +++ b/go/arrow/compute/scalar_compare_test.go @@ -28,7 +28,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "github.com/apache/arrow/go/v13/arrow/internal/testing/gen" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/compute/selection.go b/go/arrow/compute/selection.go index 2ce749a053b1f..4593ba0daf4df 100644 --- a/go/arrow/compute/selection.go +++ b/go/arrow/compute/selection.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "golang.org/x/sync/errgroup" ) diff --git a/go/arrow/compute/utils.go b/go/arrow/compute/utils.go index b22b26b2ac7d1..5b3aef1691848 100644 --- a/go/arrow/compute/utils.go +++ b/go/arrow/compute/utils.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/bitutil" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/compute/vector_hash_test.go b/go/arrow/compute/vector_hash_test.go index c3aae265c7c51..e83687b12700d 100644 --- a/go/arrow/compute/vector_hash_test.go +++ b/go/arrow/compute/vector_hash_test.go @@ -26,7 +26,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/decimal256" "github.com/apache/arrow/go/v13/arrow/memory" diff --git a/go/arrow/compute/vector_run_end_test.go b/go/arrow/compute/vector_run_end_test.go index e00990577ae72..62ea7dba015a2 100644 --- a/go/arrow/compute/vector_run_end_test.go +++ b/go/arrow/compute/vector_run_end_test.go @@ -29,7 +29,7 @@ import ( "github.com/apache/arrow/go/v13/arrow/array" "github.com/apache/arrow/go/v13/arrow/bitutil" "github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/internal/testing/gen" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/stretchr/testify/suite" diff --git a/go/arrow/compute/vector_selection_test.go b/go/arrow/compute/vector_selection_test.go index eec403fc90e73..2a97f4c1301bb 100644 --- a/go/arrow/compute/vector_selection_test.go +++ b/go/arrow/compute/vector_selection_test.go @@ -27,7 +27,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/array" 
"github.com/apache/arrow/go/v13/arrow/compute" - "github.com/apache/arrow/go/v13/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v13/arrow/compute/exec" "github.com/apache/arrow/go/v13/arrow/compute/internal/kernels" "github.com/apache/arrow/go/v13/arrow/internal/testing/gen" "github.com/apache/arrow/go/v13/arrow/memory" From 71329ce33a18a53e322514d0e463677ebad648c9 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:22:03 -0400 Subject: [PATCH 100/749] GH-37042: [MATLAB] Implement Feather V1 Writer using new MATLAB Interface APIs (#37043) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Now that we've have the basic building blocks for tabular IO in the MATLAB Interface (`Array`, `Schema`, `RecordBatch`), we can implement a Feather V1 writer in terms of the new APIs. This is the first in a series of pull requests in which we will work on replacing the legacy feather V1 infrastructure with a new implementation that use the MATLAB Interface APIs. A side effect of doing this work is that we can eventually delete a lot of legacy build infrastructure and code. ### What changes are included in this PR? 1. Added a new class called `arrow.internal.io.feather.Writer` which can be used to write feather V1 files. It has one public property named `Filename` and one public method `write`. Below is an example of its usage: ```matlab >> T = table([1; 2; 3], single([10; 11; 12])); T = 3×2 table Var1 Var2 ____ ____ 1 10 2 11 3 12 >> filename = "/tmp/table.feather"; >> writer = arrow.internal.io.feather.Writer(filename) writer = Writer with properties: Filename: "/tmp/table.feather" >> writer.write(T); ``` 2. Added an `unwrap` method to `proxy::RecordBatch` so that the `FeatherWriter::write` method can access the underlying `RecordBatch` from the proxy. 3. Changed the `SetAccess` and `GetAccess` of the `Proxy` property on `arrow.tabular.RecordBatch` to `private` and `public`, respectively. ### Are these changes tested? Yes, added a new test file called `tRoundTrip.m` in the `matlab/test/arrow/io/feather` folder. ### Are there any user-facing changes? No. ### Future Directions 1. Add a new class for reading feather V1 files (See #37041). 2. Integrate this class in the public `featherwrite` function. 5. Once this class is integrated with `featherwrite`, we can delete the legacy build infrastructure and source code. 
* Closes: #37042 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- matlab/src/cpp/arrow/matlab/error/error.h | 4 + .../matlab/io/feather/proxy/feather_writer.cc | 90 +++++++++++++++++++ .../matlab/io/feather/proxy/feather_writer.h | 41 +++++++++ matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 + .../matlab/tabular/proxy/record_batch.cc | 4 + .../arrow/matlab/tabular/proxy/record_batch.h | 2 + .../+arrow/+internal/+io/+feather/Writer.m | 48 ++++++++++ .../src/matlab/+arrow/+tabular/RecordBatch.m | 2 +- matlab/test/arrow/io/feather/tRoundTrip.m | 52 +++++++++++ .../cmake/BuildMatlabArrowInterface.cmake | 4 +- 10 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc create mode 100644 matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h create mode 100644 matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m create mode 100644 matlab/test/arrow/io/feather/tRoundTrip.m diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index b7c0d7d696d65..e1d2982f282dd 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -180,4 +180,8 @@ namespace arrow::matlab::error { static const char* UNKNOWN_PROXY_FOR_ARRAY_TYPE = "arrow:array:UnknownProxyForArrayType"; static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch"; static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex"; + static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite"; + static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable"; + static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch"; + } diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc new file mode 100644 index 0000000000000..a27e1fb0e623a --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/matlab/io/feather/proxy/feather_writer.h" +#include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/error/error.h" + +#include "arrow/result.h" +#include "arrow/table.h" +#include "arrow/util/utf8.h" + +#include "arrow/io/file.h" +#include "arrow/ipc/feather.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::io::feather::proxy { + + FeatherWriter::FeatherWriter(const std::string& filename) : filename{filename} { + REGISTER_METHOD(FeatherWriter, getFilename); + REGISTER_METHOD(FeatherWriter, write); + } + + libmexclass::proxy::MakeResult FeatherWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + mda::StructArray opts = constructor_arguments[0]; + const mda::StringArray filename_mda = opts[0]["Filename"]; + + const auto filename_utf16 = std::u16string(filename_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, + arrow::util::UTF16StringToUTF8(filename_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared(filename_utf8); + } + + void FeatherWriter::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, + arrow::util::UTF8StringToUTF16(filename), + context, + error::UNICODE_CONVERSION_ERROR_ID); + mda::ArrayFactory factory; + auto str_mda = factory.createScalar(utf16_filename); + context.outputs[0] = str_mda; + } + + void FeatherWriter::write(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::StructArray opts = context.inputs[0]; + const mda::TypedArray record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"]; + const uint64_t record_batch_proxy_id = record_batch_proxy_id_mda[0]; + + auto proxy = libmexclass::proxy::ProxyManager::getProxy(record_batch_proxy_id); + auto record_batch_proxy = std::static_pointer_cast(proxy); + auto record_batch = record_batch_proxy->unwrap(); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table, + arrow::Table::FromRecordBatches({record_batch}), + context, + error::TABLE_FROM_RECORD_BATCH); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr output_stream, + arrow::io::FileOutputStream::Open(filename), + context, + error::FAILED_TO_OPEN_FILE_FOR_WRITE); + + // Specify the feather file format version as V1 + arrow::ipc::feather::WriteProperties write_props; + write_props.version = arrow::ipc::feather::kFeatherV1Version; + + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(ipc::feather::WriteTable(*table, output_stream.get(), write_props), + context, + error::FEATHER_FAILED_TO_WRITE_TABLE); + } +} diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h new file mode 100644 index 0000000000000..dadb479887891 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/status.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::feather::proxy { + + class FeatherWriter : public libmexclass::proxy::Proxy { + public: + FeatherWriter(const std::string& filename); + + ~FeatherWriter() {} + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getFilename(libmexclass::proxy::method::Context& context); + void write(libmexclass::proxy::method::Context& context); + + private: + const std::string filename; + }; +} diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 7d18c6c6b62a8..7a2a4f3192fa1 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -25,6 +25,7 @@ #include "arrow/matlab/type/proxy/string_type.h" #include "arrow/matlab/type/proxy/timestamp_type.h" #include "arrow/matlab/type/proxy/field.h" +#include "arrow/matlab/io/feather/proxy/feather_writer.h" #include "factory.h" @@ -60,6 +61,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.type.proxy.BooleanType , arrow::matlab::type::proxy::PrimitiveCType); REGISTER_PROXY(arrow.type.proxy.StringType , arrow::matlab::type::proxy::StringType); REGISTER_PROXY(arrow.type.proxy.TimestampType , arrow::matlab::type::proxy::TimestampType); + REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter , arrow::matlab::io::feather::proxy::FeatherWriter); return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name}; }; diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc index ed30472f6c4a2..e159e926ec5ae 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc @@ -56,6 +56,10 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(RecordBatch, getColumnByIndex); } + std::shared_ptr RecordBatch::unwrap() { + return record_batch; + } + void RecordBatch::toString(libmexclass::proxy::method::Context& context) { namespace mda = ::matlab::data; MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string, arrow::util::UTF8StringToUTF16(record_batch->ToString()), context, error::UNICODE_CONVERSION_ERROR_ID); diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h index b5d741060a15d..b8c038816b34e 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h @@ -29,6 +29,8 @@ namespace arrow::matlab::tabular::proxy { virtual ~RecordBatch() {} + std::shared_ptr unwrap(); + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); protected: diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m new file mode 100644 index 
0000000000000..470c41fd5b23c --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m @@ -0,0 +1,48 @@ +%WRITER Class for writing feather V1 files. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef Writer < matlab.mixin.Scalar + + properties(Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent) + Filename + end + + methods + function obj = Writer(filename) + arguments + filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + proxyName = "arrow.io.feather.proxy.FeatherWriter"; + obj.Proxy = arrow.internal.proxy.create(proxyName, args); + end + + function write(obj, T) + rb = arrow.recordbatch(T); + args = struct(RecordBatchProxyID=rb.Proxy.ID); + obj.Proxy.write(args); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index 0d002797f0121..be5eee7d89c35 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -23,7 +23,7 @@ ColumnNames end - properties (Access=protected) + properties (Hidden, SetAccess=private, GetAccess=public) Proxy end diff --git a/matlab/test/arrow/io/feather/tRoundTrip.m b/matlab/test/arrow/io/feather/tRoundTrip.m new file mode 100644 index 0000000000000..d56152be6d1c8 --- /dev/null +++ b/matlab/test/arrow/io/feather/tRoundTrip.m @@ -0,0 +1,52 @@ +%TROUNDTRIP Round trip tests for feather. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tRoundTrip < matlab.unittest.TestCase + + methods(TestClassSetup) + % Delete once arrow.internal.io.feather.Reader is submitted. + function addFeatherFunctionsToMATLABPath(testCase) + import matlab.unittest.fixtures.PathFixture + % Add Feather test utilities to the MATLAB path. + testCase.applyFixture(PathFixture('../../../util')); + % arrow.cpp.call must be on the MATLAB path. 
+            testCase.assertTrue(~isempty(which('arrow.cpp.call')), ...
+                '''arrow.cpp.call'' must be on the MATLAB path. Use ''addpath'' to add folders to the MATLAB path.');
+        end
+    end
+
+    methods(Test)
+        function Basic(testCase)
+            import matlab.unittest.fixtures.TemporaryFolderFixture
+
+            fixture = testCase.applyFixture(TemporaryFolderFixture);
+            filename = fullfile(fixture.Folder, "temp.feather");
+
+            DoubleVar = [10; 20; 30; 40];
+            SingleVar = single([10; 15; 20; 25]);
+            tWrite = table(DoubleVar, SingleVar);
+
+            featherwrite(tWrite, filename);
+            tRead = featherread(filename);
+            testCase.verifyEqual(tWrite, tRead);
+        end
+    end
+end
+
+function featherwrite(T, filename)
+    writer = arrow.internal.io.feather.Writer(filename);
+    writer.write(T);
+end
\ No newline at end of file
diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index f4696cfad26ee..1d57999417664 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -55,7 +55,9 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc"
-                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc")
+                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc"
+                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc")
+
set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")

From 152be67100cdd367a3e6064988085e3b327ad0fe Mon Sep 17 00:00:00 2001
From: Kevin Gurney
Date: Mon, 7 Aug 2023 16:26:16 -0400
Subject: [PATCH 101/749] GH-37041: [MATLAB] Implement Feather V1 Reader using new MATLAB Interface APIs (#37044)

### Rationale for this change

Now that we have the basic building blocks for tabular IO in the MATLAB Interface (Array, Schema, RecordBatch), we can implement a Feather V1 reader in terms of the new APIs. This is a follow-up to #37043, where a new Feather V1 internal `Writer` object was added.

### What changes are included in this PR?

1. Added a new class called arrow.internal.io.feather.Reader which can be used to read Feather V1 files. It has one public property named `Filename` and one public method named `read`.

**Example Usage:**

```matlab
>> T = array2table(rand(3))

T =

  3x3 table

     Var1       Var2       Var3
    _______    ________    _______

    0.79221    0.035712    0.67874
    0.95949     0.84913    0.75774
    0.65574     0.93399    0.74313

>> filename = "test.feather";
>> featherwrite(filename, T)
>> reader = arrow.internal.io.feather.Reader(filename)

reader =

  Reader with properties:

    Filename: "test.feather"

>> T = reader.read()

T =

  3x3 table

     Var1       Var2       Var3
    _______    ________    _______

    0.79221    0.035712    0.67874
    0.95949     0.84913    0.75774
    0.65574     0.93399    0.74313
```

### Are these changes tested?

Yes.

1. Added `Reader` to `feather/tRoundTrip.m`.

### Are there any user-facing changes?

No. These are only internal objects right now.

### Future Directions

1. Re-implement `featherread` in terms of the new `Reader` object; a minimal sketch follows below.
2. Remove legacy feather code and infrastructure.

### Notes

1. For conciseness, I renamed the C++ Proxy class `FeatherWriter` to `Writer` since it is already inside of a `feather` namespace / "package".
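The planned `featherread` from the first Future Directions item could delegate to the new class in the same way as the local test helper added to `tRoundTrip.m` in this patch (error handling elided; the eventual public implementation may differ):

```matlab
function T = featherread(filename)
    % Sketch: delegate to the new internal Feather V1 reader class.
    reader = arrow.internal.io.feather.Reader(filename);
    T = reader.read();
end
```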
* Closes: #37041

Authored-by: Kevin Gurney
Signed-off-by: Kevin Gurney
---
 matlab/src/cpp/arrow/matlab/error/error.h     |  6 ++
 .../arrow/matlab/io/feather/proxy/reader.cc   | 98 +++++++++++++++++++
 .../arrow/matlab/io/feather/proxy/reader.h    | 39 ++++++++
 .../proxy/{feather_writer.cc => writer.cc}    | 16 +--
 .../proxy/{feather_writer.h => writer.h}      |  6 +-
 matlab/src/cpp/arrow/matlab/proxy/factory.cc  |  6 +-
 .../+arrow/+internal/+io/+feather/Reader.m    | 52 ++++++++++
 .../+arrow/+internal/+io/+feather/Writer.m    |  4 +-
 matlab/test/arrow/io/feather/tRoundTrip.m     |  5 +
 .../cmake/BuildMatlabArrowInterface.cmake     |  4 +-
 10 files changed, 219 insertions(+), 17 deletions(-)
 create mode 100644 matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
 create mode 100644 matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h
 create mode 100644 matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m

diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h
index e1d2982f282dd..deac5e26fc1c0 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -181,7 +181,13 @@ namespace arrow::matlab::error {
     static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch";
     static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex";
     static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite";
+    static const char* FAILED_TO_OPEN_FILE_FOR_READ = "arrow:io:FailedToOpenFileForRead";
     static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable";
     static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch";
+    static const char* FEATHER_FAILED_TO_CREATE_READER = "arrow:io:feather:FailedToCreateReader";
+    static const char* FEATHER_VERSION_2 = "arrow:io:feather:FeatherVersion2";
+    static const char* FEATHER_VERSION_UNKNOWN = "arrow:io:feather:FeatherVersionUnknown";
+    static const char* FEATHER_FAILED_TO_READ_TABLE = "arrow:io:feather:FailedToReadTable";
+    static const char* FEATHER_FAILED_TO_READ_RECORD_BATCH = "arrow:io:feather:FailedToReadRecordBatch";

 }
diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
new file mode 100644
index 0000000000000..a264d24ecb1bd
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "libmexclass/proxy/ProxyManager.h" + +#include "arrow/matlab/error/error.h" +#include "arrow/matlab/io/feather/proxy/reader.h" +#include "arrow/matlab/tabular/proxy/record_batch.h" + +#include "arrow/util/utf8.h" + +#include "arrow/result.h" + +#include "arrow/io/file.h" +#include "arrow/ipc/feather.h" +#include "arrow/table.h" + +namespace arrow::matlab::io::feather::proxy { + + Reader::Reader(const std::string& filename) : filename{filename} { + REGISTER_METHOD(Reader, read); + REGISTER_METHOD(Reader, getFilename); + } + + libmexclass::proxy::MakeResult Reader::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using ReaderProxy = arrow::matlab::io::feather::proxy::Reader; + + mda::StructArray args = constructor_arguments[0]; + const mda::StringArray filename_utf16_mda = args[0]["Filename"]; + const auto filename_utf16 = std::u16string(filename_utf16_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename, arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared(filename); + } + + void Reader::read(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch; + + mda::ArrayFactory factory; + + // Create a file input stream. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source, arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context, error::FAILED_TO_OPEN_FILE_FOR_READ); + + // Create a Reader from the file input stream. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto reader, arrow::ipc::feather::Reader::Open(source), context, error::FEATHER_FAILED_TO_CREATE_READER); + + // Error if not Feather V1. + const auto version = reader->version(); + if (version == ipc::feather::kFeatherV2Version) { + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(Status::NotImplemented("Support for Feather V2 has not been implemented."), context, error::FEATHER_VERSION_2); + } else if (version != ipc::feather::kFeatherV1Version) { + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(Status::Invalid("Unknown Feather format version."), context, error::FEATHER_VERSION_UNKNOWN); + } + + // Read a Table from the file. + std::shared_ptr table = nullptr; + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(reader->Read(&table), context, error::FEATHER_FAILED_TO_READ_TABLE); + + // Get the first RecordBatch from the Table. + arrow::TableBatchReader table_batch_reader{table}; + std::shared_ptr record_batch = nullptr; + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(table_batch_reader.ReadNext(&record_batch), context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH); + + // Create a Proxy from the first RecordBatch. 
+ auto record_batch_proxy = std::make_shared(record_batch); + const auto record_batch_proxy_id = ProxyManager::manageProxy(record_batch_proxy); + + const auto record_batch_proxy_id_mda = factory.createScalar(record_batch_proxy_id); + + context.outputs[0] = record_batch_proxy_id_mda; + } + + void Reader::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto filename_utf16, arrow::util::UTF8StringToUTF16(filename), context, error::UNICODE_CONVERSION_ERROR_ID); + auto filename_utf16_mda = factory.createScalar(filename_utf16); + context.outputs[0] = filename_utf16_mda; + } + +} diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h new file mode 100644 index 0000000000000..fb6c06de8638d --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::feather::proxy { + + class Reader : public libmexclass::proxy::Proxy { + public: + Reader(const std::string& filename); + + virtual ~Reader() {} + + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void read(libmexclass::proxy::method::Context& context); + void getFilename(libmexclass::proxy::method::Context& context); + + const std::string filename; + }; + +} diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.cc similarity index 86% rename from matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc rename to matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.cc index a27e1fb0e623a..c71c9ae7a514e 100644 --- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/matlab/io/feather/proxy/feather_writer.h" +#include "arrow/matlab/io/feather/proxy/writer.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/matlab/error/error.h" @@ -30,12 +30,12 @@ namespace arrow::matlab::io::feather::proxy { - FeatherWriter::FeatherWriter(const std::string& filename) : filename{filename} { - REGISTER_METHOD(FeatherWriter, getFilename); - REGISTER_METHOD(FeatherWriter, write); + Writer::Writer(const std::string& filename) : filename{filename} { + REGISTER_METHOD(Writer, getFilename); + REGISTER_METHOD(Writer, write); } - libmexclass::proxy::MakeResult FeatherWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + libmexclass::proxy::MakeResult Writer::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { namespace mda = ::matlab::data; mda::StructArray opts = constructor_arguments[0]; const mda::StringArray filename_mda = opts[0]["Filename"]; @@ -45,10 +45,10 @@ namespace arrow::matlab::io::feather::proxy { arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID); - return std::make_shared(filename_utf8); + return std::make_shared(filename_utf8); } - void FeatherWriter::getFilename(libmexclass::proxy::method::Context& context) { + void Writer::getFilename(libmexclass::proxy::method::Context& context) { namespace mda = ::matlab::data; MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, arrow::util::UTF8StringToUTF16(filename), @@ -59,7 +59,7 @@ namespace arrow::matlab::io::feather::proxy { context.outputs[0] = str_mda; } - void FeatherWriter::write(libmexclass::proxy::method::Context& context) { + void Writer::write(libmexclass::proxy::method::Context& context) { namespace mda = ::matlab::data; mda::StructArray opts = context.inputs[0]; const mda::TypedArray record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"]; diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.h similarity index 89% rename from matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h rename to matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.h index dadb479887891..21dc70f432a55 100644 --- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/writer.h @@ -23,11 +23,11 @@ namespace arrow::matlab::io::feather::proxy { - class FeatherWriter : public libmexclass::proxy::Proxy { + class Writer : public libmexclass::proxy::Proxy { public: - FeatherWriter(const std::string& filename); + Writer(const std::string& filename); - ~FeatherWriter() {} + ~Writer() {} static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 7a2a4f3192fa1..bce875bb9f184 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -25,7 +25,8 @@ #include "arrow/matlab/type/proxy/string_type.h" #include "arrow/matlab/type/proxy/timestamp_type.h" #include "arrow/matlab/type/proxy/field.h" -#include "arrow/matlab/io/feather/proxy/feather_writer.h" +#include "arrow/matlab/io/feather/proxy/writer.h" +#include "arrow/matlab/io/feather/proxy/reader.h" #include "factory.h" @@ -61,7 +62,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.type.proxy.BooleanType , arrow::matlab::type::proxy::PrimitiveCType); 
REGISTER_PROXY(arrow.type.proxy.StringType , arrow::matlab::type::proxy::StringType); REGISTER_PROXY(arrow.type.proxy.TimestampType , arrow::matlab::type::proxy::TimestampType); - REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter , arrow::matlab::io::feather::proxy::FeatherWriter); + REGISTER_PROXY(arrow.io.feather.proxy.Writer , arrow::matlab::io::feather::proxy::Writer); + REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader); return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name}; }; diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m b/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m new file mode 100644 index 0000000000000..80da7294d2d8d --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m @@ -0,0 +1,52 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef Reader +%READER An internal Reader object for reading Feather files. + + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + properties (Dependent, SetAccess=private, GetAccess=public) + % Name of the file to read. 
+ Filename + end + + methods + + function obj = Reader(filename) + arguments + filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + obj.Proxy = arrow.internal.proxy.create("arrow.io.feather.proxy.Reader", args); + end + + function T = read(obj) + recordBatchProxyID = obj.Proxy.read(); + proxy = libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.RecordBatch", ID=recordBatchProxyID); + recordBatch = arrow.tabular.RecordBatch(proxy); + T = recordBatch.toMATLAB(); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + + end + +end diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m index 470c41fd5b23c..37c785f10a5e3 100644 --- a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m +++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m @@ -31,7 +31,7 @@ end args = struct(Filename=filename); - proxyName = "arrow.io.feather.proxy.FeatherWriter"; + proxyName = "arrow.io.feather.proxy.Writer"; obj.Proxy = arrow.internal.proxy.create(proxyName, args); end @@ -45,4 +45,4 @@ function write(obj, T) filename = obj.Proxy.getFilename(); end end -end \ No newline at end of file +end diff --git a/matlab/test/arrow/io/feather/tRoundTrip.m b/matlab/test/arrow/io/feather/tRoundTrip.m index d56152be6d1c8..e735d196c1875 100644 --- a/matlab/test/arrow/io/feather/tRoundTrip.m +++ b/matlab/test/arrow/io/feather/tRoundTrip.m @@ -49,4 +49,9 @@ function Basic(testCase) function featherwrite(T, filename) writer = arrow.internal.io.feather.Writer(filename); writer.write(T); +end + +function T = featherread(filename) + reader = arrow.internal.io.feather.Reader(filename); + T = reader.read(); end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 1d57999417664..c19740f181444 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -56,8 +56,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc") - + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/writer.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/reader.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") From f549bf52d55228298e9007fe0f3640daa70f5dd6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 05:30:13 +0900 Subject: [PATCH 102/749] MINOR: [C#] Bump xunit.runner.visualstudio from 2.4.0 to 2.5.0 in /csharp (#36728) Bumps [xunit.runner.visualstudio](https://github.com/xunit/visualstudio.xunit) from 2.4.0 to 2.5.0.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 7c1f3a77a4c18..145865a2df46a 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 2b150955d02b7..cd42bc9d1724e 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 805fb5ab3acce..981c359cab174 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -12,7 +12,7 @@ - + all runtime; build; native; contentfiles; analyzers From e90c3160301351d68e19c5f76f001ba5cc71bdfb Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Mon, 7 Aug 2023 16:38:43 -0400 Subject: [PATCH 103/749] GH-37045: [MATLAB] Implement featherwrite in terms of arrow.internal.io.feather.Writer (#37047) ### Rationale for this change Now that #37043 is merged, we can re-implement `featherwrite` in terms of the new `arrow.internal.io.feather.Writer` class. Once this change is made, we can delete the legacy build infrastructure and featherwrite MEX code. ### What changes are included in this PR? 1. Re-implemented `featherwrite` using `arrow.internal.io.feather.Writer`. ### Are these changes tested? 1. Yes, the existing tests in `tfeather.m` cover these changes. 2. I had to update some of the expected error message IDs in `tfeather.m` because the new implementation throws errors with different IDs. 3. `featherwrite` used to export the real part of MATLAB complex numeric arrays. The new version of `featherwrite` now errors if the input table contains complex data because feather/Arrow itself does not support complex numeric data. We think this is the right decision. Writing out only the real part is lossy. ### Are there any user-facing changes? Yes, `featherwrite` no longer supports writing complex numeric arrays. ### Future Directions 1. Once this PR is merged, we will remove the legacy build infrastructure and MEX code. * Closes: #37045 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- matlab/src/matlab/featherwrite.m | 22 ++++++---------------- matlab/test/tfeather.m | 11 +++++------ 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/matlab/src/matlab/featherwrite.m b/matlab/src/matlab/featherwrite.m index c59f7f42368eb..cc3f45e954ad8 100644 --- a/matlab/src/matlab/featherwrite.m +++ b/matlab/src/matlab/featherwrite.m @@ -23,21 +23,11 @@ function featherwrite(filename, t) % specific language governing permissions and limitations % under the License. 
-import arrow.util.table2mlarrow;
+    arguments
+        filename(1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText}
+        t table
+    end

-% Validate input arguments.
-narginchk(2, 2);
-filename = convertStringsToChars(filename);
-if ~ischar(filename)
-    error('MATLAB:arrow:InvalidFilenameDatatype', ...
-        'Filename must be a character vector or string scalar.');
-end
-if ~istable(t)
-    error('MATLAB:arrow:InvalidInputTable', 't must be a table.');
-end
-
-[variables, metadata] = table2mlarrow(t);
-
-% Write the table to a Feather file.
-arrow.cpp.call('featherwrite', filename, variables, metadata);
+    writer = arrow.internal.io.feather.Writer(filename);
+    writer.write(t);
 end
diff --git a/matlab/test/tfeather.m b/matlab/test/tfeather.m
index a32b78fdcd671..e4c988e1dda49 100755
--- a/matlab/test/tfeather.m
+++ b/matlab/test/tfeather.m
@@ -164,7 +164,7 @@ function ErrorIfInvalidFilenameDatatype(testCase)

         t = createTable;

-        testCase.verifyError(@() featherwrite({filename}, t), 'MATLAB:arrow:InvalidFilenameDatatype');
+        testCase.verifyError(@() featherwrite({table}, t), 'MATLAB:validation:UnableToConvert');
         testCase.verifyError(@() featherread({filename}), 'MATLAB:arrow:InvalidFilenameDatatype');
     end

@@ -178,7 +178,7 @@ function ErrorIfTooManyInputs(testCase)
     end

     function ErrorIfTooFewInputs(testCase)
-        testCase.verifyError(@() featherwrite(), 'MATLAB:narginchk:notEnoughInputs');
+        testCase.verifyError(@() featherwrite(), 'MATLAB:minrhs');
         testCase.verifyError(@() featherread(), 'MATLAB:narginchk:notEnoughInputs');
     end

@@ -193,7 +193,7 @@ function ErrorIfMultiColVarExist(testCase)

         t = table(age, smoker, height, weight, bloodPressure);

-        testCase.verifyError(@() featherwrite(filename, t), 'MATLAB:arrow:UnsupportedVariableType');
+        testCase.verifyError(@() featherwrite(filename, t), 'arrow:array:InvalidShape');
     end

     function UnsupportedMATLABDatatypes(testCase)
@@ -205,7 +205,7 @@ function UnsupportedMATLABDatatypes(testCase)
                                       calendarDuration(5, 3, 2)];
         actualTable = addvars(actualTable, calendarDurationVariable);

-        testCase.verifyError(@() featherwrite(filename, actualTable) ,'MATLAB:arrow:UnsupportedVariableType');
+        testCase.verifyError(@() featherwrite(filename, actualTable) ,'arrow:array:UnsupportedMATLABType');
     end

     function NumericComplexUnsupported(testCase)
@@ -216,8 +216,7 @@ function NumericComplexUnsupported(testCase)
         actualTable.double(2) = exp(9) + 5i;
         actualTable.int64(2) = 1.0418e+03;

-        expectedTable = featherRoundTrip(filename, actualTable);
-        testCase.verifyNotEqual(actualTable, expectedTable);
+        testCase.verifyError(@() featherwrite(filename, actualTable) ,'arrow:array:ComplexNumeric');
     end
     end
 end

From f055f5e554020c10c343249e95b940a751eb3239 Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Mon, 7 Aug 2023 21:48:29 +0100
Subject: [PATCH 104/749] GH-36886: [C++] Configure `azurite` in preparation for testing Azure C++ filesystem (#36988)

### Rationale for this change

We need to write tests for https://github.com/apache/arrow/issues/18014. azurite is like a fake Azure Blob Storage service, so it can be used to write integration tests.

### What changes are included in this PR?

Extract the `azurite` related changes from https://github.com/apache/arrow/pull/12914 to create a smaller PR that's easier to review. I have made very minimal changes compared to that PR.

Currently `azurite` is configured for all the environments where `ARROW_AZURE` was enabled by https://github.com/apache/arrow/pull/35701. I assume it's deliberate that it's not enabled yet for Windows, Alpine, Conda, Debian, or Fedora builds.
### Are these changes tested?

It's tested, but there aren't really any good tests in this PR. I used this `azurite` config in https://github.com/apache/arrow/pull/36835 to make an integration test that uses the Azure C++ SDK. On its own we can't really write tests for this `azurite` setup PR.

### Are there any user-facing changes?

No.

* Closes: #36886

Lead-authored-by: Thomas Newton
Co-authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 .github/workflows/cpp.yml                |  4 ++
 ci/docker/ubuntu-20.04-cpp.dockerfile    |  4 ++
 ci/docker/ubuntu-22.04-cpp.dockerfile    |  4 ++
 ci/scripts/install_azurite.sh            | 37 +++++++++++
 cpp/Brewfile                             |  1 +
 cpp/src/arrow/filesystem/azurefs_test.cc | 82 +++++++++++++++++++++++-
 6 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100755 ci/scripts/install_azurite.sh

diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index cd12be11488bb..eaccf25403eba 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -23,6 +23,7 @@ on:
       - '.github/workflows/cpp.yml'
       - 'ci/docker/**'
       - 'ci/scripts/cpp_*'
+      - 'ci/scripts/install_azurite.sh'
       - 'ci/scripts/install_gcs_testbench.sh'
       - 'ci/scripts/install_minio.sh'
       - 'ci/scripts/msys2_*'
@@ -34,6 +35,7 @@ on:
       - '.github/workflows/cpp.yml'
       - 'ci/docker/**'
       - 'ci/scripts/cpp_*'
+      - 'ci/scripts/install_azurite.sh'
       - 'ci/scripts/install_gcs_testbench.sh'
       - 'ci/scripts/install_minio.sh'
       - 'ci/scripts/msys2_*'
@@ -201,6 +203,8 @@ jobs:
           ci/scripts/install_minio.sh latest /usr/local
       - name: Install Google Cloud Storage Testbench
         run: ci/scripts/install_gcs_testbench.sh default
+      - name: Install Azurite Storage Emulator
+        run: ci/scripts/install_azurite.sh
       - name: Setup ccache
         run: |
           ci/scripts/ccache_setup.sh
diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile
index f94494177e8ee..125f1f48d482e 100644
--- a/ci/docker/ubuntu-20.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -103,6 +103,7 @@ RUN apt-get update -y -q && \
         make \
         ninja-build \
         nlohmann-json3-dev \
+        npm \
        pkg-config \
        protobuf-compiler \
        python3-dev \
@@ -123,6 +124,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_gcs_testbench.sh default

+COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_azurite.sh
+
 COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_ceph.sh
diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile
index e773c6f1ee659..0840b3fa5c68d 100644
--- a/ci/docker/ubuntu-22.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-22.04-cpp.dockerfile
@@ -102,6 +102,7 @@ RUN apt-get update -y -q && \
         make \
         ninja-build \
         nlohmann-json3-dev \
+        npm \
        pkg-config \
        protobuf-compiler \
        protobuf-compiler-grpc \
@@ -153,6 +154,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_gcs_testbench.sh default

+COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_azurite.sh
+
 COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh
new file mode 100755
index 0000000000000..2e7008360fdc3
--- /dev/null
+++ b/ci/scripts/install_azurite.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+#
or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +case "$(uname)" in + Darwin) + npm install -g azurite + which azurite + ;; + MINGW*) + choco install nodejs.install + npm install -g azurite + ;; + Linux) + npm install -g azurite + which azurite + ;; +esac +echo "node version = $(node --version)" +echo "azurite version = $(azurite --version)" \ No newline at end of file diff --git a/cpp/Brewfile b/cpp/Brewfile index 580e8d3f115d5..58015d2121b5b 100644 --- a/cpp/Brewfile +++ b/cpp/Brewfile @@ -30,6 +30,7 @@ brew "grpc" brew "llvm@14" brew "lz4" brew "ninja" +brew "node" brew "openssl@3" brew "protobuf" brew "python" diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 0f03e88393aeb..e940c5bd1bc32 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -15,7 +15,26 @@ // specific language governing permissions and limitations // under the License. +#include // Missing include in boost/process + +// This boost/asio/io_context.hpp include is needless for no MinGW +// build. +// +// This is for including boost/asio/detail/socket_types.hpp before any +// "#include ". boost/asio/detail/socket_types.hpp doesn't +// work if windows.h is already included. boost/process.h -> +// boost/process/args.hpp -> boost/process/detail/basic_cmd.hpp +// includes windows.h. boost/process/args.hpp is included before +// boost/process/async.h that includes +// boost/asio/detail/socket_types.hpp implicitly is included. +#include +// We need BOOST_USE_WINDOWS_H definition with MinGW when we use +// boost/process.hpp. See BOOST_USE_WINDOWS_H=1 in +// cpp/cmake_modules/ThirdpartyToolchain.cmake for details. 
+#include + #include "arrow/filesystem/azurefs.h" +#include "arrow/util/io_util.h" #include #include @@ -27,15 +46,76 @@ #include "arrow/testing/util.h" namespace arrow { +using internal::TemporaryDir; namespace fs { namespace { +namespace bp = boost::process; using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; -// Placeholder test for file structure +class AzuriteEnv : public ::testing::Environment { + public: + AzuriteEnv() { + account_name_ = "devstoreaccount1"; + account_key_ = + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/" + "KBHBeksoGMGw=="; + auto exe_path = bp::search_path("azurite"); + if (exe_path.empty()) { + auto error = std::string("Could not find Azurite emulator."); + status_ = Status::Invalid(error); + return; + } + auto temp_dir_ = *TemporaryDir::Make("azurefs-test-"); + server_process_ = bp::child(boost::this_process::environment(), exe_path, "--silent", + "--location", temp_dir_->path().ToString(), "--debug", + temp_dir_->path().ToString() + "/debug.log"); + if (!(server_process_.valid() && server_process_.running())) { + auto error = "Could not start Azurite emulator."; + server_process_.terminate(); + server_process_.wait(); + status_ = Status::Invalid(error); + return; + } + status_ = Status::OK(); + } + + ~AzuriteEnv() override { + server_process_.terminate(); + server_process_.wait(); + } + + const std::string& account_name() const { return account_name_; } + const std::string& account_key() const { return account_key_; } + const Status status() const { return status_; } + + private: + std::string account_name_; + std::string account_key_; + bp::child server_process_; + Status status_; + std::unique_ptr temp_dir_; +}; + +auto* azurite_env = ::testing::AddGlobalTestEnvironment(new AzuriteEnv); + +AzuriteEnv* GetAzuriteEnv() { + return ::arrow::internal::checked_cast(azurite_env); +} + +// Placeholder tests for file structure // TODO: GH-18014 Remove once a proper test is added +TEST(AzureFileSystem, InitialiseAzurite) { + const std::string& account_name = GetAzuriteEnv()->account_name(); + const std::string& account_key = GetAzuriteEnv()->account_key(); + EXPECT_EQ(account_name, "devstoreaccount1"); + EXPECT_EQ(account_key, + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/" + "K1SZFPTOtr/KBHBeksoGMGw=="); +} + TEST(AzureFileSystem, OptionsCompare) { AzureOptions options; EXPECT_TRUE(options.Equals(options)); From 3937fdb5eefdf0b987dd4672a6051a1837c5fd05 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 05:51:37 +0900 Subject: [PATCH 105/749] MINOR: [C#] Bump BenchmarkDotNet.Diagnostics.Windows from 0.13.5 to 0.13.7 in /csharp (#37035) Bumps [BenchmarkDotNet.Diagnostics.Windows](https://github.com/dotnet/BenchmarkDotNet) from 0.13.5 to 0.13.7.
Release notes

Sourced from BenchmarkDotNet.Diagnostics.Windows's releases.

BenchmarkDotNet v0.13.7

This release contains a bunch of important bug fixes.

Full changelog: https://benchmarkdotnet.org/changelog/v0.13.7.html

BenchmarkDotNet v0.13.6

Highlights

  • New BenchmarkDotNet.Diagnostics.dotTrace NuGet package. Once this package is installed, you can annotate your benchmarks with the [DotTraceDiagnoser] and get a dotTrace performance snapshot at the end of the benchmark run. #2328
  • Updated documentation website. We migrated to docfx 2.67 and got the refreshed modern template based on bootstrap 5 with dark/light theme switcher.
  • Updated BenchmarkDotNet.Templates. Multiple issues were resolved; you can now create new benchmark projects from the terminal or your favorite IDE. #1658 #1881 #2149 #2338
  • Response file support. Now it's possible to pass additional arguments to BenchmarkDotNet using @ filename syntax. #2320 #2348
  • Custom runtime support. #2285
  • Introduce CategoryDiscoverer, see IntroCategoryDiscoverer. #2306 #2307
  • Multiple bug fixes.
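As a concrete illustration of the first highlight, here is a minimal sketch; it assumes the [DotTraceDiagnoser] attribute is exposed from a BenchmarkDotNet.Diagnostics.dotTrace namespace mirroring the package id, and that the package is installed next to BenchmarkDotNet itself:

```csharp
using System.Security.Cryptography;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnostics.dotTrace; // assumed namespace, mirrors the package id
using BenchmarkDotNet.Running;

// Annotating the class is all that is needed: per the highlight above, a
// dotTrace performance snapshot is collected at the end of the benchmark run.
[DotTraceDiagnoser]
public class HashingBenchmarks
{
    private readonly byte[] data = new byte[1024];

    [Benchmark]
    public byte[] Sha256() => SHA256.HashData(data);
}

public static class Program
{
    public static void Main() => BenchmarkRunner.Run<HashingBenchmarks>();
}
```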

Full changelog: https://benchmarkdotnet.org/changelog/v0.13.6.html

Commits
  • c02aeaa Prepare v0.13.7 changelog
  • fd5b766 Fix PlatformNotSupportedException thrown on Android in ConsoleTitler (#2390)
  • 914ee71 perfcollect: don't restore symbols for local builds (#2384)
  • 90ef2ae Fix debug build (#2385)
  • 6573422 Bump xUnit: 2.4.2->2.5.0
  • da02fdc Upgrade setup-node in spellcheck-docs workflow
  • a89ad26 Fix path in dotnet nuget push command
  • dc365e1 Add snupkg to nightly workflow artifacts
  • b912a8e Code cleanup
  • f8ab518 Enable nullable annotations for the whole solution
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=BenchmarkDotNet.Diagnostics.Windows&package-manager=nuget&previous-version=0.13.5&new-version=0.13.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Lead-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj index 06f42ac1c66ee..a81fc15bae861 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +++ b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj @@ -6,8 +6,8 @@ - - + + From 3fdd78d7be4678e0806f688de518f508210da1cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 05:52:16 +0900 Subject: [PATCH 106/749] MINOR: [C#] Bump coverlet.collector from 1.2.0 to 6.0.0 in /csharp (#36596) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [coverlet.collector](https://github.com/coverlet-coverage/coverlet) from 1.2.0 to 6.0.0.
Release notes

Sourced from coverlet.collector's releases.

v6.0.0

Fixed

  • Could not write lines to file CoverletSourceRootsMapping - in use by another process coverlet-coverage/coverlet#1155
  • Incorrect coverage for methods returning IAsyncEnumerable in generic classes coverlet-coverage/coverlet#1383
  • Wrong branch coverage for async methods .NET Standard 1.x coverlet-coverage/coverlet#1376
  • Empty path exception in visual basic projects coverlet-coverage/coverlet#775
  • Align published nuget package version to github release version coverlet-coverage/coverlet#1413
  • Sync nuget and github release versions coverlet-coverage/coverlet#1122

Improvements

  • Migration of the project to .NET 6.0 coverlet-coverage/coverlet#1473

Breaking changes

New parameter ExcludeAssembliesWithoutSources to control automatic assembly exclusion coverlet-coverage/coverlet#1164. The parameter InstrumentModulesWithoutLocalSources has been removed, since it can be handled by setting ExcludeAssembliesWithoutSources to None. The default heuristic for determining whether to instrument an assembly has changed. In previous versions, any missing source file was taken as a signal that the assembly came from a third-party project that shouldn't be instrumented, with exceptions for some common file name patterns used by source generators. Now only assemblies for which no source files at all can be found are excluded from instrumentation, and the code for detecting source generator files has been removed. To get back the old behaviour, where at least one missing file is sufficient to exclude an assembly, set ExcludeAssembliesWithoutSources to MissingAny, or use assembly exclusion filters for more fine-grained control.
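To make the new parameter concrete, here is a minimal runsettings sketch; it assumes coverlet.collector reads ExcludeAssembliesWithoutSources from the DataCollector's <Configuration> block the way it reads its other options (the element name and the None / MissingAny values come from the notes above):

```xml
<RunSettings>
  <DataCollectionRunSettings>
    <DataCollectors>
      <!-- coverlet.collector registers under this friendly name -->
      <DataCollector friendlyName="XPlat code coverage">
        <Configuration>
          <!-- Restore the pre-6.0 "any missing source file excludes the assembly"
               behaviour; use None to instrument everything -->
          <ExcludeAssembliesWithoutSources>MissingAny</ExcludeAssembliesWithoutSources>
        </Configuration>
      </DataCollector>
    </DataCollectors>
  </DataCollectionRunSettings>
</RunSettings>
```

Such a file would be passed to the run with something like dotnet test --settings coverage.runsettings.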

Diff between 3.2.0 and 6.0.0

v5.8.0

Fixed

  • Fix TypeLoadException when referencing Microsoft.Extensions.DependencyInjection v6.0.1 #1390
  • Source Link for code generators fails #1322
  • Await foreach has wrong branch coverage when method is generic #1210
  • ExcludeFromCodeCoverage attribute on local functions ignores lambda expression #1302

Added

  • Added InstrumentModulesWithoutLocalSources setting #1360 by @TFTomSun

Diff between 3.1.2 and 3.2.0

v5.7.2

Fixed

  • Fix CoreLib's coverage measurement is broken #1286
  • Fix UnloadModule injection #1291

Diff between 3.1.1 and 3.1.2

v5.7.1

Fixed

  • Fix wrong branch coverage with EnumeratorCancellation attribute #1275
  • Fix negative coverage exceeding int.MaxValue #1266
  • Fix summary output format for culture de-DE #1263
  • Fix branch coverage issue for finally block with await #1233
  • Fix threshold doesn't work when coverage empty #1205
  • Fix branch coverage issue for il switch #1177
  • Fix branch coverage with using statement and several awaits #1176
  • Fix CopyCoverletDataCollectorFiles to avoid overriding user dlls in the dotnet publish scenario #1243

Improvements

  • Improve logging in case of exception inside static ctor of NetstandardAwareAssemblyResolver #1230
  • When collecting, open the hitfile with read access #1214 by https://github.com/JamesWTruher

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=coverlet.collector&package-manager=nuget&previous-version=1.2.0&new-version=6.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index cd42bc9d1724e..26ab18ca7d6ae 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -9,7 +9,7 @@ - +
From ceef8bbad4206af187ef2843254c135722076e32 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 05:53:50 +0900 Subject: [PATCH 107/749] MINOR: [C#] Bump Microsoft.SourceLink.GitHub from 1.0.0 to 1.1.1 in /csharp (#37037) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Microsoft.SourceLink.GitHub](https://github.com/dotnet/sourcelink) from 1.0.0 to 1.1.1.
Release notes

Sourced from Microsoft.SourceLink.GitHub's releases.

1.1.1

Notable Changes

New Contributors

Full Changelog: https://github.com/dotnet/sourcelink/compare/1.0.0...1.1.1

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Microsoft.SourceLink.GitHub&package-manager=nuget&previous-version=1.0.0&new-version=1.1.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- csharp/src/Apache.Arrow/Apache.Arrow.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 43d60ba0012ac..b05533c26b22c 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -16,7 +16,7 @@
- + From 56c1f3e14e4e77c927b6227291cfb794df0a0655 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 05:55:16 +0900 Subject: [PATCH 108/749] MINOR: [JS] Bump word-wrap from 1.2.3 to 1.2.4 in /js (#36761) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [word-wrap](https://github.com/jonschlinkert/word-wrap) from 1.2.3 to 1.2.4.
Release notes

Sourced from word-wrap's releases.

1.2.4

What's Changed

New Contributors

Full Changelog: https://github.com/jonschlinkert/word-wrap/compare/1.2.3...1.2.4

Commits
  • f64b188 run verb to generate README
  • 03ea082 Merge pull request #42 from jonschlinkert/chore/publish-workflow
  • 420dce9 Merge pull request #41 from jonschlinkert/fix/CVE-2023-26115-2
  • bfa694e Update .github/workflows/publish.yml
  • ace0b3c chore: bump version to 1.2.4
  • 6fd7275 chore: add publish workflow
  • 30d6daf chore: fix test
  • 655929c chore: remove package-lock
  • 49e08bb chore: added an additional testcase
  • 9f62693 fix: cve 2023-26115
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=word-wrap&package-manager=npm_and_yarn&previous-version=1.2.3&new-version=1.2.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apache/arrow/network/alerts).
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/yarn.lock | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index 512dd7e07a9ea..0343bc56f8e3f 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -9429,9 +9429,9 @@ wide-align@^1.1.5: string-width "^1.0.2 || 2 || 3 || 4" word-wrap@^1.2.3: - version "1.2.3" - resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.3.tgz#610636f6b1f703891bd34771ccb17fb93b47079c" - integrity sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ== + version "1.2.4" + resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.4.tgz#cb4b50ec9aca570abd1f52f33cd45b6c61739a9f" + integrity sha512-2V81OA4ugVo5pRo46hAoD2ivUJx8jXmWXfUkY4KFNw0hEptvN0QfH3K4nHiwzGeKl5rFKedV48QVoqYavy4YpA== wordwrap@^1.0.0: version "1.0.0" @@ -9444,6 +9444,7 @@ wordwrapjs@^5.1.0: integrity sha512-JNjcULU2e4KJwUNv6CHgI46UvDGitb6dGryHajXTDiLgg1/RiGoPSDw4kZfYnwGtEXf2ZMeIewDQgFGzkCB2Sg== "wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0: + name wrap-ansi-cjs version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== From feb10f357d824a1bdadbed59b82403da13b42d28 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 7 Aug 2023 17:19:33 -0400 Subject: [PATCH 109/749] GH-37049: [MATLAB] Update feather `Reader` and `Writer` objects to work directly with `arrow.tabular.RecordBatch`s instead of MATLAB `table`s (#37052) ### Rationale for this change After thinking about how to re-implement `featherread` and `featherwrite`, we realized it would be better if the `Reader` and `Writer` classes worked directly with `arrow.tabular.RecordBatch`s instead of MATLAB `table`s. ### What changes are included in this PR? 1. Updated `read` method of `arrow.internal.io.feather.Reader` to return an `arrow.tabular.RecordBatch` rather than a MATLAB `table`. 2. Updated `write` method of `arrow.internal.io.feather.Writer` to accept an `arrow.tabular.RecordBatch` rather than a MATLAB `table`. ### Are these changes tested? Yes. 1. Updated `feather/tRoundTrip.m` to reflect the changes to the `Reader` and `Writer` classes. ### Are there any user-facing changes? 1. No These are internal APIs. 
* Closes: #37049 Authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- .../+arrow/+internal/+io/+feather/Reader.m | 3 +- .../+arrow/+internal/+io/+feather/Writer.m | 5 ++-- matlab/src/matlab/featherwrite.m | 3 +- matlab/test/arrow/io/feather/tRoundTrip.m | 30 +++++++++---------- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m b/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m index 80da7294d2d8d..6cd78646767a7 100644 --- a/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m +++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m @@ -36,11 +36,10 @@ obj.Proxy = arrow.internal.proxy.create("arrow.io.feather.proxy.Reader", args); end - function T = read(obj) + function recordBatch = read(obj) recordBatchProxyID = obj.Proxy.read(); proxy = libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.RecordBatch", ID=recordBatchProxyID); recordBatch = arrow.tabular.RecordBatch(proxy); - T = recordBatch.toMATLAB(); end function filename = get.Filename(obj) diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m index 37c785f10a5e3..64872ba4a023c 100644 --- a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m +++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m @@ -35,9 +35,8 @@ obj.Proxy = arrow.internal.proxy.create(proxyName, args); end - function write(obj, T) - rb = arrow.recordbatch(T); - args = struct(RecordBatchProxyID=rb.Proxy.ID); + function write(obj, recordBatch) + args = struct(RecordBatchProxyID=recordBatch.Proxy.ID); obj.Proxy.write(args); end diff --git a/matlab/src/matlab/featherwrite.m b/matlab/src/matlab/featherwrite.m index cc3f45e954ad8..879edd8afc68e 100644 --- a/matlab/src/matlab/featherwrite.m +++ b/matlab/src/matlab/featherwrite.m @@ -28,6 +28,7 @@ function featherwrite(filename, t) t table end + recordBatch = arrow.recordbatch(t); writer = arrow.internal.io.feather.Writer(filename); - writer.write(t); + writer.write(recordBatch); end diff --git a/matlab/test/arrow/io/feather/tRoundTrip.m b/matlab/test/arrow/io/feather/tRoundTrip.m index e735d196c1875..f361a4543b8e4 100644 --- a/matlab/test/arrow/io/feather/tRoundTrip.m +++ b/matlab/test/arrow/io/feather/tRoundTrip.m @@ -31,27 +31,27 @@ function addFeatherFunctionsToMATLABPath(testCase) methods(Test) function Basic(testCase) import matlab.unittest.fixtures.TemporaryFolderFixture - + import arrow.internal.io.feather.* + fixture = testCase.applyFixture(TemporaryFolderFixture); filename = fullfile(fixture.Folder, "temp.feather"); DoubleVar = [10; 20; 30; 40]; SingleVar = single([10; 15; 20; 25]); - tWrite = table(DoubleVar, SingleVar); - - featherwrite(tWrite, filename); - tRead = featherread(filename); - testCase.verifyEqual(tWrite, tRead); + + tableWrite = table(DoubleVar, SingleVar); + recordBatchWrite = arrow.recordbatch(tableWrite); + + writer = Writer(filename); + writer.write(recordBatchWrite); + + reader = arrow.internal.io.feather.Reader(filename); + recordBatchRead = reader.read(); + + tableRead = table(recordBatchRead); + + testCase.verifyEqual(tableWrite, tableRead); end end -end - -function featherwrite(T, filename) - writer = arrow.internal.io.feather.Writer(filename); - writer.write(T); -end -function T = featherread(filename) - reader = arrow.internal.io.feather.Reader(filename); - T = reader.read(); end \ No newline at end of file From 178b055464f35f4aeef8d5ddf34fdc15ab5be264 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 8 Aug 
2023 09:31:12 +0900 Subject: [PATCH 110/749] MINOR: [C++] Fix a lint failure (#37048) ### Rationale for this change GH-37031 had the following lint failure but I merged it. Sorry. ```text FAILED: CMakeFiles/check-format cd /tmp/arrow-lint-3lmfc4qt/cpp-build && /usr/local/bin/python /arrow/cpp/build-support/run_clang_format.py --clang_format_binary /usr/bin/clang-format-14 --exclude_globs /arrow/cpp/build-support/lint_exclusions.txt --source_dir /arrow/cpp/src --source_dir /arrow/cpp/examples --source_dir /arrow/cpp/tools --quiet --- /arrow/cpp/src/arrow/acero/aggregate_internal.h +++ /arrow/cpp/src/arrow/acero/aggregate_internal.h (after clang format) @@ -52,8 +52,8 @@ // segment-keys is used to refine the partitioning. However, segment-keys are different in // that they partition only consecutive rows into a single group. Such a partition of // consecutive rows is called a segment group. For example, consider a column X with -// values [A, A, B, A] at row-indices [0, 1, 2, 3]. A regular group-by aggregation with keys -// [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by +// values [A, A, B, A] at row-indices [0, 1, 2, 3]. A regular group-by aggregation with +// keys [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by // aggregation with segment-keys [X] yields [[0, 1], [2], [3]]. // // The implementation first segments the input using the segment-keys, then groups by the /arrow/cpp/src/arrow/acero/aggregate_internal.h had clang-format style issues ``` ### What changes are included in this PR? I've fixed it by `ninja format`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/acero/aggregate_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/aggregate_internal.h b/cpp/src/arrow/acero/aggregate_internal.h index 744acb124505a..72537a7f7e3fe 100644 --- a/cpp/src/arrow/acero/aggregate_internal.h +++ b/cpp/src/arrow/acero/aggregate_internal.h @@ -52,8 +52,8 @@ // segment-keys is used to refine the partitioning. However, segment-keys are different in // that they partition only consecutive rows into a single group. Such a partition of // consecutive rows is called a segment group. For example, consider a column X with -// values [A, A, B, A] at row-indices [0, 1, 2, 3]. A regular group-by aggregation with keys -// [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by +// values [A, A, B, A] at row-indices [0, 1, 2, 3]. A regular group-by aggregation with +// keys [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by // aggregation with segment-keys [X] yields [[0, 1], [2], [3]]. // // The implementation first segments the input using the segment-keys, then groups by the From e51ea4014fe3d3b97c07e697a30aac5f396c187a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 09:33:21 +0900 Subject: [PATCH 111/749] MINOR: [C#] Bump xunit from 2.4.0 to 2.5.0 in /csharp (#37036) Bumps [xunit](https://github.com/xunit/xunit) from 2.4.0 to 2.5.0.
Commits
  • 12ec6b8 Bump up to v2.5.1-pre
  • 92f31c4 v2.5.0
  • 188216b Ensure both failed tests and error cause error return codes (and cancel for s...
  • c973179 Latest analyzers
  • f93d52d Propagate StopOnFail from config file => execution options
  • 5e70f54 #2737: Restore XunitException constructor (breaking change) (v2)
  • 028866f Add PackageDownload for Microsoft.NETFramework.ReferenceAssemblies.net452 to ...
  • aeba1bb Remove UAP/UWP support
  • 5308021 #1857: Add overload to Assert.Contains for ConcurrentDictionary (v2) (see the sketch after this list)
  • 5fbaa0a Incorrect type restriction on obsoleted Assert.ThrowsAny<T> for Func<Task> an...
  • Additional commits viewable in compare view
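Of these, commit 5308021 (#1857) is the most user-visible; a minimal sketch, assuming the new overload asserts on the dictionary's keys like the existing IDictionary overload does:

```csharp
using System.Collections.Concurrent;
using Xunit;

public class ConcurrentDictionaryAssertions
{
    [Fact]
    public void Contains_accepts_a_ConcurrentDictionary()
    {
        var cache = new ConcurrentDictionary<string, int>();
        cache.TryAdd("answer", 42);

        // With the dedicated overload from #1857 this call binds cleanly instead
        // of tripping over the IDictionary/IReadOnlyDictionary overload ambiguity.
        Assert.Contains("answer", cache);
    }
}
```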

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit&package-manager=nuget&previous-version=2.4.0&new-version=2.5.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 145865a2df46a..b0a844972f33d 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 26ab18ca7d6ae..81848a735f559 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 981c359cab174..d77863dad9b71 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -11,7 +11,7 @@ - + all runtime; build; native; contentfiles; analyzers From 55dcd65582c29cca6951790f29f5905f3b6ec3e2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 09:34:09 +0900 Subject: [PATCH 112/749] MINOR: [JS] Bump semver from 5.7.1 to 5.7.2 in /js (#36603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [semver](https://github.com/npm/node-semver) from 5.7.1 to 5.7.2.
Release notes

Sourced from semver's releases.

v5.7.2

5.7.2 (2023-07-10)

Bug Fixes

Changelog

Sourced from semver's changelog.

5.7.2 (2023-07-10)

Bug Fixes

5.7

  • Add minVersion method (see the usage sketch after this changelog)

5.6

  • Move boolean loose param to an options object, with backwards-compatibility protection.
  • Add ability to opt out of special prerelease version handling with the includePrerelease option flag.

5.5

  • Add version coercion capabilities

5.4

  • Add intersection checking

5.3

  • Add minSatisfying method

5.2

  • Add prerelease(v) that returns prerelease components

5.1

  • Add Backus-Naur for ranges
  • Remove excessively cute inspection methods

5.0

  • Remove AMD/Browserified build artifacts
  • Fix ltr and gtr when using the * range
  • Fix for range * with a prerelease identifier
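Several of the entries above name concrete API additions; a small usage sketch, assuming a CommonJS consumer on semver >= 5.7 (every call below is one of the methods listed in this changelog):

```js
const semver = require('semver');

semver.minVersion('>=1.2.3 <2.0.0').version;          // '1.2.3'      (5.7)
semver.coerce('v2.3').version;                        // '2.3.0'      (5.5)
semver.intersects('>=1.0.0', '<2.0.0');               // true         (5.4)
semver.minSatisfying(['1.1.0', '1.2.0'], '>=1.2.0');  // '1.2.0'      (5.3)
semver.prerelease('1.2.3-alpha.1');                   // ['alpha', 1] (5.2)

// 5.6: the boolean `loose` parameter became an options object, which also
// carries the includePrerelease opt-out of prerelease special-casing.
semver.satisfies('1.2.3-beta.1', '>=1.0.0', { includePrerelease: true }); // true
```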
Commits
Maintainer changes

This version was pushed to npm by lukekarrys, a new releaser for semver since your current version.


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=semver&package-manager=npm_and_yarn&previous-version=5.7.1&new-version=5.7.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apache/arrow/network/alerts).
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/yarn.lock | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index 0343bc56f8e3f..bda4faa3b17cb 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -8079,9 +8079,9 @@ semver-greatest-satisfied-range@^1.1.0: sver-compat "^1.5.0" "semver@2 || 3 || 4 || 5", semver@^5.6.0: - version "5.7.1" - resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.1.tgz#a954f931aeba508d307bbf069eff0c01c96116f7" - integrity sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ== + version "5.7.2" + resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.2.tgz#48d55db737c3287cd4835e17fa13feace1c41ef8" + integrity sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g== semver@7.3.4: version "7.3.4" @@ -8091,16 +8091,16 @@ semver@7.3.4: lru-cache "^6.0.0" semver@7.x, semver@^7.0.0, semver@^7.1.1, semver@^7.3.4, semver@^7.3.5, semver@^7.3.7, semver@^7.3.8: - version "7.5.1" - resolved "https://registry.yarnpkg.com/semver/-/semver-7.5.1.tgz#c90c4d631cf74720e46b21c1d37ea07edfab91ec" - integrity sha512-Wvss5ivl8TMRZXXESstBA4uR5iXgEN/VC5/sOcuXdVLzcdkz4HWetIoRfG5gb5X+ij/G9rw9YoGn3QoQ8OCSpw== + version "7.5.4" + resolved "https://registry.yarnpkg.com/semver/-/semver-7.5.4.tgz#483986ec4ed38e1c6c48c34894a9182dbff68a6e" + integrity sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA== dependencies: lru-cache "^6.0.0" semver@^6.0.0, semver@^6.3.0: - version "6.3.0" - resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.0.tgz#ee0a64c8af5e8ceea67687b133761e1becbd1d3d" - integrity sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw== + version "6.3.1" + resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4" + integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA== serialize-javascript@^6.0.1: version "6.0.1" From 21e1e00ab89870e5010d4aaaf91e689821035e81 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:02:35 +0900 Subject: [PATCH 113/749] MINOR: [C#] Bump Microsoft.NET.Test.Sdk from 15.8.0 to 17.7.0 in /csharp (#37038) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Microsoft.NET.Test.Sdk](https://github.com/microsoft/vstest) from 15.8.0 to 17.7.0.
Release notes

Sourced from Microsoft.NET.Test.Sdk's releases.

v17.7.0

Issues Fixed

  • RTM build fix
  • [rel/17.7] Fix branding on 17.7-release #4615
  • Externals final 17.7 upgrade #4565
  • "Snap" main to rel/17.7 #4558
  • Disable pre-start of testhosts #4554
  • Downgrade Nuget.Frameworks to 6.5.0 #4512
  • Update Nuget.Frameworks #4500
  • Fix hangdump running into crashdump #4480
  • Fix no-suitable provider found #4474
  • Fix Newtonsoft versions in testhost.deps.json #4372
  • Bumped TP version to 17.7.0 (#4346)

Full Changelog: here

Drops

  • Microsoft.TestPlatform.ObjectModel : v17.7.0

v17.7.0-preview-23364-03

Issues Fixed

  • Trim away netframework targets in source-build #4357
  • Re-enable publishing nugets, don't continue on error #4356
  • Merged PR 30352: arcade into main
  • Merged PR 29591: Onboard to arcade

Full Changelog: here

Drops

v17.7.0-preview.23280.1

⚠️ This version of the Microsoft.TestPlatform nuget was unlisted on Nuget.org because it causes issues when running tests in Azure DevOps when the VSTest installer is used and is configured to pick up the latest preview version (#4544).

What is new since 17.6.0?

Version 17.7.0 was largely devoted to moving our infrastructure over to building via Arcade, which brought us a more unified build experience and better cooperation with other teams in the dotnet organization.

This migration brought a major change to our versioning of preview packages, which will no longer follow the 17.6.0-preview-20230223-05 (<version>-preview<date>-<build>) scheme; this release's own version number, 17.7.0-preview.23280.1, shows the new format.

Fixes

... (truncated)

Changelog

Sourced from Microsoft.NET.Test.Sdk's changelog.

Release Notes

17.6.1 and newer

Please see release notes directly in the release page: https://github.com/microsoft/vstest/releases

17.6.0

Issues Fixed

  • Add legacy feeds
  • [rel/17.6] Fix Newtonsoft versions in testhost.deps.json #4372
  • Revert "Revert "Fix signature verification" (#4333" #4345
  • Revert "Fix signature verification" #4333
  • Fix signature verification #4331
  • Pre-start testhosts #3666
  • Add dotnet vstest deprecation message #4297
  • Catch unhandled exception and avoid crash on test host exit #4291
  • Remove chutzpah #4249
  • Fix string conversion of Microsoft.TestPlatform.Extensions.TrxLogger.ObjectMode.TestOutcome #4243
  • Fix potential trx logger NRE #4240
  • handle object disposed exception #4221
  • Added support for checking testhost compatibility with test sessions #4199

See full log here

Artifacts

  • TestPlatform vsix: 17.6.0
  • Microsoft.TestPlatform.ObjectModel : 17.6.0

17.3.3

Issues Fixed

  • [rel/17.3] Update Newtonsoft.Json to 13.0.1 #4299

See full log here

Drops

  • TestPlatform vsix: 17.3.3
  • Microsoft.TestPlatform.ObjectModel : 17.3.3

17.2.1

Issues Fixed

  • [rel/17.2] Update Newtonsoft.Json to 13.0.1 #4310

See full log here

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Microsoft.NET.Test.Sdk&package-manager=nuget&previous-version=15.8.0&new-version=17.7.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index b0a844972f33d..d0294ac3b44a5 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 81848a735f559..ebb622c9334fe 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index d77863dad9b71..a66f6fd6e305a 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -10,7 +10,7 @@ - + all From c6c21c0056675b5137fd902022c24980fac726b4 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 8 Aug 2023 09:24:45 -0400 Subject: [PATCH 114/749] MINOR: [Go] Add gRPC status details to sample Flight SQL server (#37026) ### Rationale for this change Needed to test apache/arrow-adbc#963. ### What changes are included in this PR? Have the SQLite Flight SQL server sample emit a gRPC status detail. ### Are these changes tested? No. ### Are there any user-facing changes? No. 
Authored-by: David Li Signed-off-by: David Li --- .../flight/flightsql/example/sql_batch_reader.go | 12 +++++++++++- go/arrow/flight/flightsql/example/sqlite_server.go | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/go/arrow/flight/flightsql/example/sql_batch_reader.go b/go/arrow/flight/flightsql/example/sql_batch_reader.go index ae70406693e25..a8735de48a5e6 100644 --- a/go/arrow/flight/flightsql/example/sql_batch_reader.go +++ b/go/arrow/flight/flightsql/example/sql_batch_reader.go @@ -31,6 +31,9 @@ import ( "github.com/apache/arrow/go/v13/arrow/flight/flightsql" "github.com/apache/arrow/go/v13/arrow/internal/debug" "github.com/apache/arrow/go/v13/arrow/memory" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "google.golang.org/protobuf/types/known/wrapperspb" ) func getArrowTypeFromString(dbtype string) arrow.DataType { @@ -257,12 +260,19 @@ func (r *SqlBatchReader) Next() bool { rows := 0 for rows < maxBatchSize && r.rows.Next() { if err := r.rows.Scan(r.rowdest...); err != nil { - r.err = err + // Not really useful except for testing Flight SQL clients + detail := wrapperspb.StringValue{Value: r.schema.String()} + if st, sterr := status.New(codes.Unknown, err.Error()).WithDetails(&detail); sterr != nil { + r.err = err + } else { + r.err = st.Err() + } return false } for i, v := range r.rowdest { fb := r.bldr.Field(i) + switch v := v.(type) { case *uint8: fb.(*array.Uint8Builder).Append(*v) diff --git a/go/arrow/flight/flightsql/example/sqlite_server.go b/go/arrow/flight/flightsql/example/sqlite_server.go index 34093079d72a4..2f8ff99b3155f 100644 --- a/go/arrow/flight/flightsql/example/sqlite_server.go +++ b/go/arrow/flight/flightsql/example/sqlite_server.go @@ -52,7 +52,9 @@ import ( "github.com/apache/arrow/go/v13/arrow/flight/flightsql/schema_ref" "github.com/apache/arrow/go/v13/arrow/memory" "github.com/apache/arrow/go/v13/arrow/scalar" + "google.golang.org/grpc" "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" _ "modernc.org/sqlite" ) @@ -462,6 +464,9 @@ type dbQueryCtx interface { func doGetQuery(ctx context.Context, mem memory.Allocator, db dbQueryCtx, query string, schema *arrow.Schema, args ...interface{}) (*arrow.Schema, <-chan flight.StreamChunk, error) { rows, err := db.QueryContext(ctx, query, args...) if err != nil { + // Not really useful except for testing Flight SQL clients + trailers := metadata.Pairs("afsql-sqlite-query", query) + grpc.SetTrailer(ctx, trailers) return nil, nil, err } From 59f30f089879196910c4eca0f0530ec1d039cc71 Mon Sep 17 00:00:00 2001 From: Ben Harkins <60872452+benibus@users.noreply.github.com> Date: Tue, 8 Aug 2023 13:40:08 -0400 Subject: [PATCH 115/749] GH-36892: [C++] Fix performance regressions in `FieldPath::Get` (#37032) ### Rationale for this change https://github.com/apache/arrow/pull/35197 appears to have introduced significant performance regressions in `FieldPath::Get` - indicated [here](https://conbench.ursa.dev/compare/runs/9cf73ac83f0a44179e6538b2c1c7babd...3d76cb5ffb8849bf8c3ea9b32d08b3b7/), in a benchmark that uses a wide (10K column) dataframe. ### What changes are included in this PR? - Adds basic benchmarks for `FieldPath::Get` across various input types, as they didn't previously exist - Addresses several performance issues. These came in the form of extremely high upfront costs for the `RecordBatch` and `ArrayData` overloads specifically - Some minor refactoring of `NestedSelector` ### Are these changes tested? 
Yes (covered by existing tests) ### Are there any user-facing changes? No * Closes: #36892 Lead-authored-by: benibus Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/type.cc | 102 +++++++++++++++++--------- cpp/src/arrow/type_benchmark.cc | 125 ++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 68dc2aabe96ad..9267f1e499720 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1066,17 +1066,29 @@ std::string FieldPath::ToString() const { return repr; } -static Status NonStructError() { - return Status::NotImplemented("Get child data of non-struct array"); -} +struct NestedSelectorUtil { + static Status NonStructError() { + return Status::NotImplemented("Get child data of non-struct array"); + } + + template + static const DataType* GetType(const T& input) { + if constexpr (std::is_same_v) { + return input.type.get(); + } else { + return input.type().get(); + } + } +}; -// Utility class for retrieving a child field/column from a top-level Field, Array, or -// ChunkedArray. The "root" value can either be a single parent or a vector of its -// children. +// Utility class for retrieving a child field/column from a top-level Field, Array, +// ArrayData, or ChunkedArray. The "root" value can either be a single parent or a vector +// of its children. template class NestedSelector { public: using ArrowType = T; + using Util = NestedSelectorUtil; explicit NestedSelector(const std::vector>& children) : parent_or_children_(&children) {} @@ -1095,7 +1107,18 @@ class NestedSelector { Result GetChild(int i) const { std::shared_ptr child; if (auto parent = get_parent()) { - ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_)); + const DataType* type = Util::GetType(*parent); + // We avoid this check for schema fields since it's inconsequential (plus there are + // tests elsewhere that rely on it not happening) + if constexpr (!std::is_same_v) { + if (ARROW_PREDICT_FALSE(type->id() != Type::STRUCT)) { + return Util::NonStructError(); + } + } + // Bounds-check the index *once* using the parent's type + if (ARROW_PREDICT_TRUE(i >= 0 && i < type->num_fields())) { + ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_)); + } } else if (auto children = get_children()) { if (ARROW_PREDICT_TRUE(i >= 0 && static_cast(i) < children->size())) { child = (*children)[i]; @@ -1129,10 +1152,10 @@ class NestedSelector { *os << "column types: { "; if (auto children = get_children()) { for (const auto& child : *children) { - *os << *child->type() << ", "; + *os << *Util::GetType(*child) << ", "; } } else if (auto parent = get_parent()) { - for (const auto& field : parent->type()->fields()) { + for (const auto& field : Util::GetType(*parent)->fields()) { *os << *field->type() << ", "; } } @@ -1155,21 +1178,33 @@ class NestedSelector { } static Result> GetChild(const Field& field, int i, MemoryPool*) { - if (ARROW_PREDICT_FALSE(i < 0 || i >= field.type()->num_fields())) { - return nullptr; - } return field.type()->field(i); } - static Result> GetChild(const Array& array, int i, - MemoryPool* pool) { - if (ARROW_PREDICT_FALSE(array.type_id() != Type::STRUCT)) { - return NonStructError(); - } - if (ARROW_PREDICT_FALSE(i < 0 || i >= array.num_fields())) { - return nullptr; + static Result> GetChild(const ArrayData& data, int i, + MemoryPool* pool) { + std::shared_ptr child_data; + if constexpr (IsFlattening) { + // First, convert to an Array so we can use 
StructArray::GetFlattenedField + auto array = MakeArray(data.Copy()); + ARROW_ASSIGN_OR_RAISE(auto child_array, GetChild(*array, i, pool)); + child_data = child_array->data(); + } else { + // We could achieve the same result by converting to an Array (via MakeArray), + // calling StructArray::field(i), and pulling out the new ArrayData. However, this + // process can be very expensive when there are many columns - so we just + // reimplement the functionality that we need + child_data = data.child_data[i]; + if (data.offset != 0 || data.child_data[i]->length != data.length) { + child_data = child_data->Slice(data.offset, data.length); + } } + return std::move(child_data); + } + + static Result> GetChild(const Array& array, int i, + MemoryPool* pool) { const auto& struct_array = checked_cast(array); if constexpr (IsFlattening) { return struct_array.GetFlattenedField(i, pool); @@ -1181,22 +1216,15 @@ class NestedSelector { static Result> GetChild(const ChunkedArray& chunked_array, int i, MemoryPool* pool) { const auto& type = *chunked_array.type(); - if (ARROW_PREDICT_FALSE(type.id() != Type::STRUCT)) { - return NonStructError(); - } - if (ARROW_PREDICT_FALSE(i < 0 || i >= type.num_fields())) { - return nullptr; - } ArrayVector chunks; chunks.reserve(chunked_array.num_chunks()); for (const auto& parent_chunk : chunked_array.chunks()) { ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i, pool)); - if (!chunk) return nullptr; chunks.push_back(std::move(chunk)); } - return ChunkedArray::Make(std::move(chunks), type.field(i)->type()); + return std::make_shared(std::move(chunks), type.field(i)->type()); } std::shared_ptr owned_parent_; @@ -1289,7 +1317,11 @@ Result> FieldPath::GetAll(const Schema& schm, } Result> FieldPath::Get(const RecordBatch& batch) const { - return FieldPathGetImpl::Get(this, ZeroCopySelector(batch.columns())); + // Deliberately calling `column_data` here because `RecordBatch::columns` is nontrivial + ARROW_ASSIGN_OR_RAISE( + auto data, + FieldPathGetImpl::Get(this, ZeroCopySelector(batch.column_data()))); + return MakeArray(data); } Result> FieldPath::Get(const Table& table) const { @@ -1301,11 +1333,7 @@ Result> FieldPath::Get(const Array& array) const { } Result> FieldPath::Get(const ArrayData& data) const { - // We indirect from ArrayData to Array rather than vice-versa because, when selecting a - // nested column, the StructArray::field method does the work of adjusting the data's - // offset/length if necessary. 
- ARROW_ASSIGN_OR_RAISE(auto array, Get(*MakeArray(data.Copy()))); - return array->data(); + return FieldPathGetImpl::Get(this, ZeroCopySelector(data)); } Result> FieldPath::Get( @@ -1320,8 +1348,7 @@ Result> FieldPath::GetFlattened(const Array& array, Result> FieldPath::GetFlattened(const ArrayData& data, MemoryPool* pool) const { - ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()), pool)); - return array->data(); + return FieldPathGetImpl::Get(this, FlatteningSelector(data, pool)); } Result> FieldPath::GetFlattened( @@ -1332,7 +1359,10 @@ Result> FieldPath::GetFlattened( Result> FieldPath::GetFlattened(const RecordBatch& batch, MemoryPool* pool) const { - return FieldPathGetImpl::Get(this, FlatteningSelector(batch.columns(), pool)); + ARROW_ASSIGN_OR_RAISE( + auto data, FieldPathGetImpl::Get( + this, FlatteningSelector(batch.column_data(), pool))); + return MakeArray(data); } Result> FieldPath::GetFlattened(const Table& table, diff --git a/cpp/src/arrow/type_benchmark.cc b/cpp/src/arrow/type_benchmark.cc index de90577ffdf64..17dccfcb33138 100644 --- a/cpp/src/arrow/type_benchmark.cc +++ b/cpp/src/arrow/type_benchmark.cc @@ -18,15 +18,19 @@ #include #include #include +#include #include #include #include #include "benchmark/benchmark.h" +#include "arrow/array.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" @@ -418,6 +422,120 @@ static void ErrorSchemeExceptionNoInline( state.SetItemsProcessed(state.iterations() * integers.size()); } +// ---------------------------------------------------------------------- +// FieldPath::Get benchmarks + +static std::shared_ptr GenerateTestSchema(int num_columns) { + FieldVector fields(num_columns); + for (int i = 0; i < num_columns; ++i) { + auto name = std::string("f") + std::to_string(i); + fields[i] = field(std::move(name), int64()); + } + return schema(std::move(fields)); +} + +static std::shared_ptr GenerateTestArray(int num_columns) { + constexpr int64_t kLength = 100; + + auto rand = random::RandomArrayGenerator(0xbeef); + auto schm = GenerateTestSchema(num_columns); + + ArrayVector columns(num_columns); + for (auto& column : columns) { + column = rand.Int64(kLength, 0, std::numeric_limits::max()); + } + + return *StructArray::Make(columns, schm->fields()); +} + +static std::shared_ptr ToBatch(const std::shared_ptr& array) { + return *RecordBatch::FromStructArray(array); +} + +static std::shared_ptr ToChunked(const std::shared_ptr& array, + double chunk_proportion = 1.0) { + auto struct_array = internal::checked_pointer_cast(array); + const auto num_rows = struct_array->length(); + const auto chunk_length = static_cast(std::ceil(num_rows * chunk_proportion)); + + ArrayVector chunks; + for (int64_t offset = 0; offset < num_rows;) { + int64_t slice_length = std::min(chunk_length, num_rows - offset); + chunks.push_back(*struct_array->SliceSafe(offset, slice_length)); + offset += slice_length; + } + + return *ChunkedArray::Make(std::move(chunks)); +} + +static std::shared_ptr
ToTable(const std::shared_ptr& array, + double chunk_proportion = 1.0) { + return *Table::FromChunkedStructArray(ToChunked(array, chunk_proportion)); +} + +template +static void BenchmarkFieldPathGet(benchmark::State& state, // NOLINT non-const reference + const T& input, int num_columns, + std::optional num_chunks = {}) { + // Reassigning a single FieldPath var within each iteration's scope seems to be costly + // enough to influence the timings, so we preprocess them. + std::vector paths(num_columns); + for (int i = 0; i < num_columns; ++i) { + paths[i] = {i}; + } + + for (auto _ : state) { + for (const auto& path : paths) { + benchmark::DoNotOptimize(path.Get(input).ValueOrDie()); + } + } + + state.SetItemsProcessed(state.iterations() * num_columns); + state.counters["num_columns"] = num_columns; + if (num_chunks.has_value()) { + state.counters["num_chunks"] = num_chunks.value(); + } +} + +static void FieldPathGetFromWideArray( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + auto array = GenerateTestArray(kNumColumns); + BenchmarkFieldPathGet(state, *array, kNumColumns); +} + +static void FieldPathGetFromWideArrayData( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + auto array = GenerateTestArray(kNumColumns); + BenchmarkFieldPathGet(state, *array->data(), kNumColumns); +} + +static void FieldPathGetFromWideBatch( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + auto batch = ToBatch(GenerateTestArray(kNumColumns)); + BenchmarkFieldPathGet(state, *batch, kNumColumns); +} + +static void FieldPathGetFromWideChunkedArray( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + // Percentage representing the size of each chunk relative to the total length (smaller + // proportion means more chunks) + const double chunk_proportion = state.range(0) / 100.0; + auto chunked_array = ToChunked(GenerateTestArray(kNumColumns), chunk_proportion); + BenchmarkFieldPathGet(state, *chunked_array, kNumColumns, chunked_array->num_chunks()); +} + +static void FieldPathGetFromWideTable( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + const double chunk_proportion = state.range(0) / 100.0; + auto table = ToTable(GenerateTestArray(kNumColumns), chunk_proportion); + BenchmarkFieldPathGet(state, *table, kNumColumns, table->column(0)->num_chunks()); +} + BENCHMARK(TypeEqualsSimple); BENCHMARK(TypeEqualsComplex); BENCHMARK(TypeEqualsWithMetadata); @@ -436,4 +554,11 @@ BENCHMARK(ErrorSchemeStatusNoInline); BENCHMARK(ErrorSchemeResultNoInline); BENCHMARK(ErrorSchemeExceptionNoInline); +BENCHMARK(FieldPathGetFromWideArray); +BENCHMARK(FieldPathGetFromWideArrayData); +BENCHMARK(FieldPathGetFromWideBatch); + +BENCHMARK(FieldPathGetFromWideChunkedArray)->Arg(2)->Arg(10)->Arg(25)->Arg(100); +BENCHMARK(FieldPathGetFromWideTable)->Arg(2)->Arg(10)->Arg(25)->Arg(100); + } // namespace arrow From b1e85a6d0cd7a57f93b97d74bb13e89517e3d92e Mon Sep 17 00:00:00 2001 From: Li Jin Date: Tue, 8 Aug 2023 18:25:06 -0400 Subject: [PATCH 116/749] GH-36672: [Python][C++] Add support for vector function UDF (#36673) ### Rationale for this change In Arrow compute, there are four main types of functions: Scalar, Vector, ScalarAggregate and HashAggregate. 
Some of the previous work added support for Scalar, ScalarAggregate(https://github.com/apache/arrow/issues/35515) and HashAggregate(https://github.com/apache/arrow/issues/36252). I think it makes sense to add support for vector functions as well to complete all non-decomposable UDF kernel support. Internally, we plan to extend Acero to implement a "SegmentVectorNode" which would use this API to invoke vector functions on a segment-by-segment basis, which will allow using constant memory to compute things like "rank the value across all rows per segment using a python UDF". ### What changes are included in this PR? The change is very similar to the support for aggregate functions: it includes code to register the vector UDF, and a kernel that invokes the vector UDF on given inputs. ### Are these changes tested? Yes. Added a new test. ### Are there any user-facing changes? Yes. This adds a user-facing API to register vector functions. * Closes: #36672 Authored-by: Li Jin Signed-off-by: Li Jin --- python/pyarrow/_compute.pyx | 84 +++++++++++++++++++++++++- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 4 ++ python/pyarrow/src/arrow/python/udf.cc | 41 ++++++++----- python/pyarrow/src/arrow/python/udf.h | 5 ++ python/pyarrow/tests/test_udf.py | 70 +++++++++++++++++++++ 6 files changed, 188 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ac7efeff41aba..bc3b9e8c558e0 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1964,7 +1964,7 @@ class CumulativeOptions(_CumulativeOptions): Parameters ---------- start : Scalar, default None - Starting value for the cumulative operation. If none is given, + Starting value for the cumulative operation. If none is given, a default value depending on the operation and input type is used. skip_nulls : bool, default False When false, the first encountered null is propagated. @@ -2707,6 +2707,11 @@ cdef get_register_aggregate_function(): reg.register_func = RegisterAggregateFunction return reg +cdef get_register_vector_function(): + cdef RegisterUdf reg = RegisterUdf.__new__(RegisterUdf) + reg.register_func = RegisterVectorFunction + return reg + def register_scalar_function(func, function_name, function_doc, in_types, out_type, func_registry=None): @@ -2789,6 +2794,83 @@ def register_scalar_function(func, function_name, function_doc, in_types, out_ty out_type, func_registry) +def register_vector_function(func, function_name, function_doc, in_types, out_type, + func_registry=None): + """ + Register a user-defined vector function. + + This API is EXPERIMENTAL. + + A vector function is a function that executes vector + operations on arrays. Vector functions are often used + when the computation doesn't fit other, more specific + types of functions (e.g., scalar and aggregate). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. 
+ function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "percent rank" + >>> func_doc["description"] = "compute percent rank" + >>> + >>> def list_flatten_udf(ctx, x): + ... return pc.list_flatten(x) + >>> + >>> func_name = "list_flatten_udf" + >>> in_types = {"array": pa.list_(pa.int64())} + >>> out_type = pa.int64() + >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, + ... in_types, out_type) + >>> + >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) + >>> answer + + [ + 1, + 2, + 3, + 4 + ] + """ + return _register_user_defined_function(get_register_vector_function(), + func, function_name, function_doc, in_types, + out_type, func_registry) + + def register_aggregate_function(func, function_name, function_doc, in_types, out_type, func_registry=None): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 0fefa18dd1136..7b8983cbb98d2 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -87,6 +87,7 @@ register_scalar_function, register_tabular_function, register_aggregate_function, + register_vector_function, UdfContext, # Expressions Expression, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index da46cdcb750d5..f4d6541fa724c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2815,5 +2815,9 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: function[CallbackUdf] wrapper, const CUdfOptions& options, CFunctionRegistry* registry) + CStatus RegisterVectorFunction(PyObject* function, + function[CallbackUdf] wrapper, const CUdfOptions& options, + CFunctionRegistry* registry) + CResult[shared_ptr[CRecordBatchReader]] CallTabularFunction( const c_string& func_name, const vector[CDatum]& args, CFunctionRegistry* registry) diff --git a/python/pyarrow/src/arrow/python/udf.cc b/python/pyarrow/src/arrow/python/udf.cc index 435c89f596d48..f7761a9277f0e 100644 --- a/python/pyarrow/src/arrow/python/udf.cc +++ b/python/pyarrow/src/arrow/python/udf.cc @@ -292,14 +292,14 @@ struct PythonUdfHashAggregatorImpl : public HashUdfAggregator { return out; } - Status Resize(KernelContext* ctx, int64_t new_num_groups) { + Status Resize(KernelContext* ctx, int64_t new_num_groups) override { // We only need to change num_groups in resize // similar to other hash aggregate kernels num_groups = new_num_groups; return Status::OK(); } - Status Consume(KernelContext* ctx, const ExecSpan& batch) { + Status Consume(KernelContext* ctx, const ExecSpan& batch) override { ARROW_ASSIGN_OR_RAISE( std::shared_ptr rb, batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool())); @@ -316,7 +316,7 @@ struct PythonUdfHashAggregatorImpl : public HashUdfAggregator { return Status::OK(); } Status Merge(KernelContext* ctx, KernelState&& other_state, - const ArrayData& group_id_mapping) { + const ArrayData& group_id_mapping) override { // This is 
similar to GroupedListImpl auto& other = checked_cast(other_state); auto& other_values = other.values; @@ -336,7 +336,7 @@ struct PythonUdfHashAggregatorImpl : public HashUdfAggregator { return Status::OK(); } - Status Finalize(KernelContext* ctx, Datum* out) { + Status Finalize(KernelContext* ctx, Datum* out) override { // Exclude the last column which is the group id const int num_args = input_schema->num_fields() - 1; @@ -484,24 +484,25 @@ Status PythonUdfExec(compute::KernelContext* ctx, const compute::ExecSpan& batch return SafeCallIntoPython([&]() -> Status { return udf->Exec(ctx, batch, out); }); } -Status RegisterUdf(PyObject* user_function, compute::KernelInit kernel_init, - UdfWrapperCallback wrapper, const UdfOptions& options, +template +Status RegisterUdf(PyObject* function, compute::KernelInit kernel_init, + UdfWrapperCallback cb, const UdfOptions& options, compute::FunctionRegistry* registry) { - if (!PyCallable_Check(user_function)) { + if (!PyCallable_Check(function)) { return Status::TypeError("Expected a callable Python object."); } - auto scalar_func = std::make_shared( - options.func_name, options.arity, options.func_doc); - Py_INCREF(user_function); + auto scalar_func = + std::make_shared(options.func_name, options.arity, options.func_doc); + Py_INCREF(function); std::vector input_types; for (const auto& in_dtype : options.input_types) { input_types.emplace_back(in_dtype); } compute::OutputType output_type(options.output_type); auto udf_data = std::make_shared( - std::make_shared(user_function), wrapper, + std::make_shared(function), cb, TypeHolder::FromTypes(options.input_types), options.output_type); - compute::ScalarKernel kernel( + Kernel kernel( compute::KernelSignature::Make(std::move(input_types), std::move(output_type), options.arity.is_varargs), PythonUdfExec, kernel_init); @@ -522,9 +523,17 @@ Status RegisterUdf(PyObject* user_function, compute::KernelInit kernel_init, Status RegisterScalarFunction(PyObject* function, UdfWrapperCallback cb, const UdfOptions& options, compute::FunctionRegistry* registry) { - return RegisterUdf(function, - PythonUdfKernelInit{std::make_shared(function)}, cb, - options, registry); + return RegisterUdf( + function, PythonUdfKernelInit{std::make_shared(function)}, cb, + options, registry); +} + +Status RegisterVectorFunction(PyObject* function, UdfWrapperCallback cb, + const UdfOptions& options, + compute::FunctionRegistry* registry) { + return RegisterUdf( + function, PythonUdfKernelInit{std::make_shared(function)}, cb, + options, registry); } Status RegisterTabularFunction(PyObject* function, UdfWrapperCallback cb, @@ -536,7 +545,7 @@ Status RegisterTabularFunction(PyObject* function, UdfWrapperCallback cb, if (options.output_type->id() != Type::type::STRUCT) { return Status::Invalid("tabular function with non-struct output"); } - return RegisterUdf( + return RegisterUdf( function, PythonTableUdfKernelInit{std::make_shared(function), cb}, cb, options, registry); } diff --git a/python/pyarrow/src/arrow/python/udf.h b/python/pyarrow/src/arrow/python/udf.h index 682cbb2ffe8d5..d8c4e430e53d4 100644 --- a/python/pyarrow/src/arrow/python/udf.h +++ b/python/pyarrow/src/arrow/python/udf.h @@ -67,6 +67,11 @@ Status ARROW_PYTHON_EXPORT RegisterAggregateFunction( PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options, compute::FunctionRegistry* registry = NULLPTR); +/// \brief register a Vector user-defined-function from Python +Status ARROW_PYTHON_EXPORT RegisterVectorFunction( + PyObject* user_function, 
UdfWrapperCallback wrapper, const UdfOptions& options, + compute::FunctionRegistry* registry = NULLPTR); + + Result<std::shared_ptr<RecordBatchReader>> ARROW_PYTHON_EXPORT CallTabularFunction(const std::string& func_name, const std::vector<Datum>& args, compute::FunctionRegistry* registry = NULLPTR); diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 5631e19455c06..62d1eb5bafd4f 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -299,6 +299,44 @@ def raising_func(ctx): return raising_func, func_name + +@pytest.fixture(scope="session") +def unary_vector_func_fixture(): + """ + Register a vector function + """ + def pct_rank(ctx, x): + # copy here to get around pandas 1.0 issue + return pa.array(x.to_pandas().copy().rank(pct=True)) + + func_name = "y=pct_rank(x)" + doc = empty_udf_doc + pc.register_vector_function(pct_rank, func_name, doc, { + 'x': pa.float64()}, pa.float64()) + + return pct_rank, func_name + + +@pytest.fixture(scope="session") +def struct_vector_func_fixture(): + """ + Register a vector function that returns a struct array + """ + def pivot(ctx, k, v, c): + df = pa.RecordBatch.from_arrays([k, v, c], names=['k', 'v', 'c']).to_pandas() + df_pivot = df.pivot(columns='c', values='v', index='k').reset_index() + return pa.RecordBatch.from_pandas(df_pivot).to_struct_array() + + func_name = "y=pivot(x)" + doc = empty_udf_doc + pc.register_vector_function( + pivot, func_name, doc, + {'k': pa.int64(), 'v': pa.float64(), 'c': pa.utf8()}, + pa.struct([('k', pa.int64()), ('v1', pa.float64()), ('v2', pa.float64())]) + ) + + return pivot, func_name + + def check_scalar_function(func_fixture, inputs, *, run_in_dataset=True, @@ -797,3 +835,35 @@ def test_hash_agg_random(sum_agg_func_fixture): [("value", "sum")]).rename_columns(['id', 'value_sum_udf']) assert result.sort_by('id') == expected.sort_by('id') + + +@pytest.mark.pandas +def test_vector_basic(unary_vector_func_fixture): + arr = pa.array([10.0, 20.0, 30.0, 40.0, 50.0], pa.float64()) + result = pc.call_function("y=pct_rank(x)", [arr]) + expected = unary_vector_func_fixture[0](None, arr) + assert result == expected + + +@pytest.mark.pandas +def test_vector_empty(unary_vector_func_fixture): + arr = pa.array([1], pa.float64()) + result = pc.call_function("y=pct_rank(x)", [arr]) + expected = unary_vector_func_fixture[0](None, arr) + assert result == expected + + +@pytest.mark.pandas +def test_vector_struct(struct_vector_func_fixture): + k = pa.array( + [1, 1, 2, 2], pa.int64() + ) + v = pa.array( + [1.0, 2.0, 3.0, 4.0], pa.float64() + ) + c = pa.array( + ['v1', 'v2', 'v1', 'v2'] + ) + result = pc.call_function("y=pivot(x)", [k, v, c]) + expected = struct_vector_func_fixture[0](None, k, v, c) + assert result == expected From 77d87a97275c075e05835d6ac08be1d1c65c5c26 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 9 Aug 2023 09:12:07 +0900 Subject: [PATCH 117/749] GH-37051: [Dev][JS] Add Dependabot configuration for npm (#37053) ### Rationale for this change We can add the `MINOR: [JS] ` prefix to PRs from Dependabot automatically. ### What changes are included in this PR? Add a configuration for npm. ### Are these changes tested? No. I want to test this by merging this to main. ### Are there any user-facing changes? No. 
* Closes: #37051 Lead-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 538482f96c0ee..795e1fbba9216 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,6 +23,12 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [CI] " + - package-ecosystem: "npm" + directory: "/js/" + schedule: + interval: "monthly" + commit-message: + prefix: "MINOR: [JS] " - package-ecosystem: "nuget" directory: "/csharp/" schedule: From 56b6051c467447c3a6278150823b445e378192a6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Aug 2023 09:53:35 +0900 Subject: [PATCH 118/749] MINOR: [C#] Bump System.Runtime.CompilerServices.Unsafe from 4.5.3 to 4.7.1 in /csharp (#37074) Bumps [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/corefx) from 4.5.3 to 4.7.1.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- csharp/src/Apache.Arrow/Apache.Arrow.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index b05533c26b22c..1eec449077479 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -11,7 +11,7 @@ - + From af94a42df82f1411e44b9163c926c2f3a9615d6f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Aug 2023 09:56:02 +0900 Subject: [PATCH 119/749] MINOR: [C#] Bump ZstdSharp.Port from 0.6.7 to 0.7.2 in /csharp (#37075) Bumps [ZstdSharp.Port](https://github.com/oleg-st/ZstdSharp) from 0.6.7 to 0.7.2.
Release notes

Sourced from ZstdSharp.Port's releases.

0.7.2: IL2CPP compatibility (Unity)
0.7.1: Ported zstd v1.5.5
0.7.0: Ported zstd v1.5.4; improved decompression speed (~5-10%)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 7795d24778985..70e12835f1821 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -7,7 +7,7 @@ - + From f8a45482822ed75133d385ffa5e99e74daa5e2a5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Aug 2023 09:57:36 +0900 Subject: [PATCH 120/749] MINOR: [C#] Bump Google.Protobuf from 3.19.3 to 3.24.0 in /csharp (#37077) Bumps [Google.Protobuf](https://github.com/protocolbuffers/protobuf) from 3.19.3 to 3.24.0.
Release notes

Sourced from Google.Protobuf's releases.

Protocol Buffers v3.20.3

Java

  • Refactoring java full runtime to reuse sub-message builders and prepare to migrate parsing logic from parse constructor to builder.
  • Move proto wireformat parsing functionality from the private "parsing constructor" to the Builder class.
  • Change the Lite runtime to prefer merging from the wireformat into mutable messages rather than building up a new immutable object before merging. This way results in fewer allocations and copy operations.
  • Make message-type extensions merge from wire-format instead of building up instances and merging afterwards. This has much better performance.
  • Fix TextFormat parser to build up recurring (but supposedly not repeated) sub-messages directly from text rather than building a new sub-message and merging the fully formed message into the existing field.
  • This release addresses a Security Advisory for Java users

Protocol Buffers v3.20.2

C++

Protocol Buffers v3.20.1

PHP

  • Fix building packaged PHP extension (#9727)
  • Fixed composer.json to only advertise compatibility with PHP 7.0+. (#9819)

Ruby

  • Disable the aarch64 build on macOS until it can be fixed. (#9816)

Other

  • Fix versioning issues in 3.20.0

Protocol Buffers v3.20.1-rc1

PHP

  • Fix building packaged PHP extension (#9727)

Other

  • Fix versioning issues in 3.20.0

Protocol Buffers v3.20.0

2022-03-25 version 3.20.0 (C++/Java/Python/PHP/Objective-C/C#/Ruby/JavaScript)

Ruby

  • Dropped Ruby 2.3 and 2.4 support for CI and releases. (#9311)
  • Added Ruby 3.1 support for CI and releases (#9566).
  • Message.decode/encode: Add recursion_limit option (#9218/#9486)
  • Allocate with xrealloc()/xfree() so message allocation is visible to the Ruby GC. In certain tests this leads to much lower memory usage due to more frequent GC runs (#9586).
  • Fix conversion of singleton classes in Ruby (#9342)

... (truncated)

Commits
  • 093e258 Updating version.json and repo version numbers to: 24.0
  • e48f6f7 Merge pull request #13430 from protocolbuffers/24.x-202308011841
  • e82fc05 Updating version.json to: 24.0-dev
  • 3487ae0 Updating version.json and repo version numbers to: 24.0-rc3
  • e916bf6 Merge pull request #13421 from zhangskz/update-24-x-dep
  • fce87eb Update protobuf repo's upb dependency to latest 24.x commit
  • 64d9df7 Merge pull request #13418 from protocolbuffers/bazel6
  • ad916a9 Merge pull request #13417 from mkruskal-google/bazel6
  • bc9002e bazel: Get rid of exec_tools. (#13401)
  • fff4905 Drop support for Bazel 5.
  • Additional commits viewable in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index ed33d88861415..8d447e5babbc2 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + From f2b1c145dae63c1741c564b29ee29575c343ec0b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Aug 2023 11:36:09 +0900 Subject: [PATCH 121/749] MINOR: [JS] Bump memfs from 3.5.3 to 4.2.1 in /js (#37078) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [memfs](https://github.com/streamich/memfs) from 3.5.3 to 4.2.1.
Release notes

Sourced from memfs's releases.

v4.2.1

4.2.1 (2023-08-07)

Bug Fixes

  • don't error when watched directory gets renamed (#939) (b431b08)

memfs v4

memfs v4 has been released, install 4.2.0 NPM package.

File System Access API

memfs added adapters for File System Access (FSA) API, which is a file system API available in browsers. FSA can write to a real user folder, which the user picks; or, without any permissions, can write to a virtual file system, called OPFS (Origin Private File System). memfs implements adapters, which:

  • Construct a Node's fs-like API out of the FSA API.
    • In browser, support for synchronous fs methods is also available using a WebWorker.
  • Also, the other way around, memfs can provide FSA API on top of any fs-like file system.

Other notable changes

  • In-memory fs changes
    • A number of in-memory fs bugs were fixed.
    • Type interfaces of methods and options objects for in-memory fs were improved.
  • mkdirp and mkdirpSync were removed; they were deprecated before. (You can pass the "recursive" flag to mkdir and mkdirSync, instead.)
    • Many re-usable Node.js fs utilities now live in the /src/node folder.
    • Promises API methods are now bound by default to their this object, just like in native fs module.
  • crudfs and casfs
    • crudfs implementation on top of Node fs was added.
    • crudfs implementation on top of File System Access API was added.
    • casfs implementation on top of crudfs was added.
  • print utility was added which allows to recursively print a directory tree to terminal.
  • snapshot utility was added which allows to create recursive binary snapshots of folders and then load them back into some folder.
  • The /demo folder now contains multiple Webpack demos.
  • Build changes and deprecations
    • TypeScript was upgraded from version 4 to 5.
    • tslib is now included as peer dependency, instead of TypeScript helpers being emitted into distributable.
    • TypeScript build target of the distributable is now es2017, instead of the previous es5.
    • The library is now tested on Node v18+ versions.
    • BigInt shim is no longer shipped, memfs will rely on the BigInt provided by the JavaScript environment. You can shim that global, if necessary.
    • fs-monkey dependency was removed.
  • New features in this major release are marked as experimental, which means those APIs are in preview; their public interface may have breaking changes even across minor releases.
  • CircleCI was removed, now GitHub Actions power all builds and releases.
  • The next branch was tested, which deploys pre-releases.

Bug Fixes

... (truncated)

Changelog

Sourced from memfs's changelog.

4.2.1 (2023-08-07)

Bug Fixes

  • don't error when watched directory gets renamed (#939) (b431b08)

4.1.0 (2023-06-26)

Bug Fixes

  • 🐛 add support for unknown nodes (77786f1)
  • 🐛 allow readin into various kinds of buffers (361812d)
  • 🐛 allow readin into various kinds of buffers (e9c70e9)
  • 🐛 allow to seek in file (c04895b)
  • 🐛 allow to seek in file (b363689)
  • 🐛 correctly handle directory paths (ea909e8)
  • 🐛 do not allow empty children names (f014fd8)
  • 🐛 do not allow empty children names (43da1d6)
  • 🐛 handle root folder better (89bbffd)
  • 🐛 handle root folder better (76de780)
  • 🐛 improve file opening and closing logic (403c271)
  • 🐛 throw "ENOENT" and "ENOTDIR" when folder or file 404 (5de4faa)
  • 🐛 throw "ENOENT" and "ENOTDIR" when folder or file 404 (ddd5d56)

Features

  • 🎸 add .toTree() to Volume (2d5c4cb)
  • 🎸 add .truncate() method (038ab36)
  • 🎸 add .truncate() method (085335c)
  • 🎸 add ability to close files (0db56be)
  • 🎸 add ability to close files (d3828a8)
  • 🎸 add ability to create sub directories (8f15bd9)
  • 🎸 add ability to create sub directories (528c807)
  • 🎸 add ability to remove all files (76cabc7)
  • 🎸 add ability to remove all files (566e29b)
  • 🎸 add appendFileSync() method (57192fe)
  • 🎸 add appendFileSync() method (27411e4)
  • 🎸 add basenem() utility (8b27695)
  • 🎸 add basenem() utility (43354e5)
  • 🎸 add binary serialization to snapshots (c1cd615)
  • 🎸 add copyFile() method (de2bb0a)
  • 🎸 add copyFile() method (5e207c4)
  • 🎸 add copyFileSync() method (7e0137c)
  • 🎸 add copyFileSync() method (5fc1bac)
  • 🎸 add createSwapFile() method (dfdb908)
  • 🎸 add createSwapFile() method (b07ce79)
  • 🎸 add crudfs types (18c0658)

... (truncated)

Commits
  • a16834f chore(release): 4.2.1 [skip ci]
  • b431b08 fix: don't error when watched directory gets renamed (#939)
  • cd6c256 chore(deps): bump semver from 5.7.1 to 5.7.2 (#935)
  • 3356138 Merge pull request #903 from streamich/renovate/rimraf-5.x
  • af1e9d1 Release 4.2.0
  • 75e60a5 docs: remove v4 notice
  • 7556e5b chore(release): 4.1.0 [skip ci]
  • 51c4052 chore(deps): update dependency rimraf to v5
  • 32dd55a Merge pull request #926 from streamich/next
  • 491272b chore: 🤖 bump compile target to es2017
  • Additional commits viewable in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/js/package.json b/js/package.json index bcb43e8bd0e31..510d1afd9a659 100644 --- a/js/package.json +++ b/js/package.json @@ -100,7 +100,7 @@ "jest": "29.5.0", "jest-silent-reporter": "0.5.0", "lerna": "7.0.0", - "memfs": "3.5.3", + "memfs": "4.2.1", "mkdirp": "3.0.1", "multistream": "4.1.0", "randomatic": "3.1.1", diff --git a/js/yarn.lock b/js/yarn.lock index bda4faa3b17cb..eae220fc84f72 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -2030,6 +2030,11 @@ arg@^4.1.0: resolved "https://registry.yarnpkg.com/arg/-/arg-4.1.3.tgz#269fc7ad5b8e42cb63c896d5666017261c144089" integrity sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA== +arg@^5.0.2: + version "5.0.2" + resolved "https://registry.yarnpkg.com/arg/-/arg-5.0.2.tgz#c81433cc427c92c4dcf4865142dbca6f15acd59c" + integrity sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg== + argparse@^1.0.7: version "1.0.10" resolved "https://registry.yarnpkg.com/argparse/-/argparse-1.0.10.tgz#bcd6791ea5ae09725e17e5ad988134cd40b3d911" @@ -4162,11 +4167,6 @@ fs-mkdirp-stream@^1.0.0: graceful-fs "^4.1.11" through2 "^2.0.3" -fs-monkey@^1.0.4: - version "1.0.4" - resolved "https://registry.yarnpkg.com/fs-monkey/-/fs-monkey-1.0.4.tgz#ee8c1b53d3fe8bb7e5d2c5c5dfc0168afdd2f747" - integrity sha512-INM/fWAxMICjttnD0DX1rBvinKskj5G1w+oy/pnm9u/tSlnBrzFonJMcalKJ30P8RRsPzKcCG7Q8l0jx5Fh9YQ== - fs.realpath@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" @@ -4827,6 +4827,11 @@ humanize-ms@^1.2.1: dependencies: ms "^2.0.0" +hyperdyperid@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/hyperdyperid/-/hyperdyperid-1.2.0.tgz#59668d323ada92228d2a869d3e474d5a33b69e6b" + integrity sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A== + iconv-lite@^0.4.24: version "0.4.24" resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.24.tgz#2022b4b25fbddc21d2f524974a474aafe733908b" @@ -5793,6 +5798,14 @@ json-bignum@^0.0.3: resolved "https://registry.yarnpkg.com/json-bignum/-/json-bignum-0.0.3.tgz#41163b50436c773d82424dbc20ed70db7604b8d7" integrity sha512-2WHyXj3OfHSgNyuzDbSxI1w2jgw5gkWSWhS7Qg4bWXx1nLk3jnbwfUeS0PSba3IzpTUWdHxBieELUzXRjQB2zg== +json-joy@^9.2.0: + version "9.5.1" + resolved "https://registry.yarnpkg.com/json-joy/-/json-joy-9.5.1.tgz#056683b4db4b0e279451a563a756b70b9fd97fa3" + integrity sha512-XMSpdxaiWUZlc+CAUbPS3G2MZbGxm6clFatqjta/DLrq5V4Y5JU4cx7Qvy7l+XTVPvmRWaYuzzAuCf9uUc40IA== + dependencies: + arg "^5.0.2" + hyperdyperid "^1.2.0" + json-parse-better-errors@^1.0.1: version "1.0.2" resolved "https://registry.yarnpkg.com/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz#bb867cfb3450e69107c131d1c514bab3dc8bcaa9" @@ -6339,12 +6352,13 @@ math-random@^1.0.1: resolved "https://registry.yarnpkg.com/math-random/-/math-random-1.0.4.tgz#5dd6943c938548267016d4e34f057583080c514c" integrity sha512-rUxjysqif/BZQH2yhd5Aaq7vXMSx9NdEsQcyA07uEzIvxgI7zIr33gGsh+RU0/XjmQpCW7RsVof1vlkvQVCK5A== -memfs@3.5.3: - version "3.5.3" - resolved "https://registry.yarnpkg.com/memfs/-/memfs-3.5.3.tgz#d9b40fe4f8d5788c5f895bda804cd0d9eeee9f3b" - integrity 
sha512-UERzLsxzllchadvbPs5aolHh65ISpKpM+ccLbOJ8/vvpBKmAWf+la7dXFy7Mr0ySHbdHrFv5kGFCUHHe6GFEmw== +memfs@4.2.1: + version "4.2.1" + resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.2.1.tgz#8c5a48707a460dde8e734b15e405e8377db2bec5" + integrity sha512-CINEB6cNAAhLUfRGrB4lj2Pj47ygerEmw3jxPb6R1gkD6Jfp484gJLteQ6MzqIjGWtFWuVzDl+KN7HiipMuKSw== dependencies: - fs-monkey "^1.0.4" + json-joy "^9.2.0" + thingies "^1.11.1" memoizee@0.4.X: version "0.4.15" @@ -8703,6 +8717,11 @@ textextensions@^3.2.0: resolved "https://registry.yarnpkg.com/textextensions/-/textextensions-3.3.0.tgz#03530d5287b86773c08b77458589148870cc71d3" integrity sha512-mk82dS8eRABNbeVJrEiN5/UMSCliINAuz8mkUwH4SwslkNP//gbEzlWNS5au0z5Dpx40SQxzqZevZkn+WYJ9Dw== +thingies@^1.11.1: + version "1.12.0" + resolved "https://registry.yarnpkg.com/thingies/-/thingies-1.12.0.tgz#a815c224482d607aa70f563d3cbb351a338e4710" + integrity sha512-AiGqfYC1jLmJagbzQGuoZRM48JPsr9yB734a7K6wzr34NMhjUPrWSQrkF7ZBybf3yCerCL2Gcr02kMv4NmaZfA== + through2-filter@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/through2-filter/-/through2-filter-3.0.0.tgz#700e786df2367c2c88cd8aa5be4cf9c1e7831254" From b668595dda38beb7bfd5b5528c3200402590c1eb Mon Sep 17 00:00:00 2001 From: Joe Marshall Date: Wed, 9 Aug 2023 03:50:03 +0100 Subject: [PATCH 122/749] GH-35176: [C++] Add support for disabling threading for emscripten (#35672) As previously discussed in #35176 this is a patch that adds an option `ARROW_ENABLE_THREADING`. When it is turned off, arrow threadpool and serial executors don't spawn threads, and instead run tasks in the main thread when futures are waited for. It doesn't mess with threading in projects included as dependencies, e.g. multithreaded malloc implementations, because if you're building for a non-threaded environment, you can't use those anyway. Basically where this is at is that it runs the test suite okay, and I think it should work well enough to be a backend for pandas on emscripten/pyodide. What this means is: 1) It is possible to use arrow in non-threaded emscripten/webassembly environments (with some build patches specific to emscripten which I'll put in once this is in) 2) Most of arrow just works, albeit slower in parts. Things that don't work and probably won't: 1) Server stuff that relies on threads. Not a massive problem I think because environments with threading restrictions are currently typically also restricted from making servers anyway (i.e. they are web browsers) 2) Anything that relies on actually doing two things at once (for obvious reasons) Things that don't work yet and could be fixed in future: 1) use of asynchronous file/network APIs in emscripten, which would mean I/O could work efficiently in one thread. 2) asofjoin - right now the implementation relies on std::thread - it needs refactoring to work with threadpool like everything else in arrow, but I'm not sure I am expert enough in the codebase to do it well.
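To make the single-threaded mode concrete, here is a minimal, self-contained sketch of the pattern used when `ARROW_ENABLE_THREADING` is off (`InlineExecutor` and `ToyFuture` are illustrative stand-ins, not Arrow's actual classes): instead of blocking on a condition variable, a waiter makes progress by draining queued executor tasks on its own thread until the future finishes, which is the role `SerialExecutor::RunTasksOnAllExecutors()` plays in the real change.

```cpp
#include <deque>
#include <functional>
#include <iostream>

// Illustrative stand-in for a serial executor: tasks are queued and only
// ever run inline on the calling thread, never on a worker thread.
struct InlineExecutor {
  std::deque<std::function<void()>> queue;

  void Spawn(std::function<void()> task) { queue.push_back(std::move(task)); }

  // Runs one queued task; returns false if the queue was empty.
  bool RunOneTask() {
    if (queue.empty()) return false;
    auto task = std::move(queue.front());
    queue.pop_front();
    task();
    return true;
  }
};

// Illustrative stand-in for a future: waiting drains tasks instead of
// blocking, so a single thread can both wait and do the work.
struct ToyFuture {
  bool finished = false;

  void Wait(InlineExecutor& executor) {
    while (!finished) {
      // If nothing is runnable we could never finish; the real patch logs
      // a warning after ten seconds without runnable tasks.
      if (!executor.RunOneTask()) break;
    }
  }
};

int main() {
  InlineExecutor executor;
  ToyFuture future;
  // The "asynchronous" completion is queued, then executed by the waiter.
  executor.Spawn([&] { future.finished = true; });
  future.Wait(executor);
  std::cout << (future.finished ? "finished" : "stuck") << std::endl;
  return 0;
}
```

In the actual diff, `Future::DoWait` loops on `RunTasksOnAllExecutors()` across every live serial executor, so waiting on one future can drive tasks queued for another.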
* Closes: #35176 Lead-authored-by: Joe Marshall Co-authored-by: Sutou Kouhei Co-authored-by: Weston Pace Co-authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- ci/scripts/cpp_build.sh | 1 + cpp/cmake_modules/DefineOptions.cmake | 2 + cpp/src/arrow/acero/CMakeLists.txt | 21 +- cpp/src/arrow/acero/asof_join_node.cc | 5 + cpp/src/arrow/acero/bloom_filter.cc | 4 + cpp/src/arrow/acero/bloom_filter_test.cc | 7 +- cpp/src/arrow/acero/plan_test.cc | 4 + cpp/src/arrow/acero/task_util.cc | 5 + cpp/src/arrow/acero/task_util.h | 1 + cpp/src/arrow/acero/task_util_test.cc | 7 + cpp/src/arrow/dataset/dataset_writer_test.cc | 10 + cpp/src/arrow/engine/substrait/serde_test.cc | 11 + cpp/src/arrow/io/memory_test.cc | 4 + cpp/src/arrow/testing/gtest_util.cc | 89 ++++++ cpp/src/arrow/util/async_generator_test.cc | 11 + cpp/src/arrow/util/config.h.cmake | 1 + cpp/src/arrow/util/future.cc | 33 ++- cpp/src/arrow/util/future_test.cc | 7 +- cpp/src/arrow/util/iterator_test.cc | 2 + cpp/src/arrow/util/task_group.cc | 8 + cpp/src/arrow/util/thread_pool.cc | 257 +++++++++++++++++- cpp/src/arrow/util/thread_pool.h | 103 ++++++- cpp/src/arrow/util/thread_pool_test.cc | 41 +++ .../parquet/encryption/key_management_test.cc | 6 + dev/tasks/tasks.yml | 9 + 25 files changed, 629 insertions(+), 20 deletions(-) diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index e53b3fa460915..f71724cf61eb4 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -106,6 +106,7 @@ cmake \ -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ -DARROW_DATASET=${ARROW_DATASET:-ON} \ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ + -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index f32bb2bcf7290..29517567ce6e5 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -196,6 +196,8 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_WITH_MUSL "Whether the system libc is musl or not" OFF) + define_option(ARROW_ENABLE_THREADING "Enable threading in Arrow core" ON) + #---------------------------------------------------------------------- set_option_category("Test and benchmark") diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index c2c91db58d38a..44fbb26f0814d 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -173,7 +173,14 @@ add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc bloom_filter_test.cc) add_arrow_acero_test(pivot_longer_node_test SOURCES pivot_longer_node_test.cc test_nodes.cc) -add_arrow_acero_test(asof_join_node_test SOURCES asof_join_node_test.cc test_nodes.cc) + +# asof_join_node uses std::thread internally +# and doesn't use ThreadPool so it will +# be broken if threading is turned off +if(ARROW_ENABLE_THREADING) + add_arrow_acero_test(asof_join_node_test SOURCES asof_join_node_test.cc test_nodes.cc) +endif() + add_arrow_acero_test(tpch_node_test SOURCES tpch_node_test.cc) add_arrow_acero_test(union_node_test SOURCES union_node_test.cc) add_arrow_acero_test(aggregate_node_test SOURCES aggregate_node_test.cc) @@ -221,7 +228,9 @@ if(ARROW_BUILD_BENCHMARKS) add_arrow_acero_benchmark(project_benchmark SOURCES benchmark_util.cc 
project_benchmark.cc) - add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc) + if(ARROW_ENABLE_THREADING) + add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc) + endif() add_arrow_acero_benchmark(tpch_benchmark SOURCES tpch_benchmark.cc) @@ -244,7 +253,9 @@ if(ARROW_BUILD_BENCHMARKS) target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_static) - target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static) + if(ARROW_ENABLE_THREADING) + target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static) + endif() target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_static) if(ARROW_BUILD_OPENMP_BENCHMARKS) target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static) @@ -253,7 +264,9 @@ if(ARROW_BUILD_BENCHMARKS) target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_shared) - target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared) + if(ARROW_ENABLE_THREADING) + target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared) + endif() target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_shared) if(ARROW_BUILD_OPENMP_BENCHMARKS) target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared) diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index b7f5d878e5881..23c07b8acb95f 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -49,6 +49,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/future.h" #include "arrow/util/string.h" @@ -1707,6 +1708,10 @@ class AsofJoinNode : public ExecNode { } Status StartProducing() override { +#ifndef ARROW_ENABLE_THREADING + return Status::NotImplemented("ASOF join requires threading enabled"); +#endif + ARROW_ASSIGN_OR_RAISE(process_task_, plan_->query_context()->BeginExternalTask( "AsofJoinNode::ProcessThread")); if (!process_task_.is_valid()) { diff --git a/cpp/src/arrow/acero/bloom_filter.cc b/cpp/src/arrow/acero/bloom_filter.cc index b9855ee506d27..db39ad1a83cab 100644 --- a/cpp/src/arrow/acero/bloom_filter.cc +++ b/cpp/src/arrow/acero/bloom_filter.cc @@ -20,6 +20,7 @@ #include "arrow/acero/util.h" // PREFETCH #include "arrow/util/bit_util.h" // Log2 #include "arrow/util/bitmap_ops.h" // CountSetBits +#include "arrow/util/config.h" namespace arrow { namespace acero { @@ -426,6 +427,9 @@ void BloomFilterBuilder_Parallel::CleanUp() { std::unique_ptr BloomFilterBuilder::Make( BloomFilterBuildStrategy strategy) { +#ifndef ARROW_ENABLE_THREADING + strategy = BloomFilterBuildStrategy::SINGLE_THREADED; +#endif switch (strategy) { case BloomFilterBuildStrategy::SINGLE_THREADED: { std::unique_ptr impl{new BloomFilterBuilder_SingleThreaded()}; diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index 95375e277e2b8..bad331cfd99d1 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -29,6 +29,8 @@ #include "arrow/acero/util.h" #include 
"arrow/compute/key_hash.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/config.h" +#include "arrow/util/cpu_info.h" namespace arrow { @@ -468,7 +470,7 @@ TEST(BloomFilter, Basic) { std::vector strategies; strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); -#ifndef ARROW_VALGRIND +#if defined(ARROW_ENABLE_THREADING) && !defined(ARROW_VALGRIND) strategies.push_back(BloomFilterBuildStrategy::PARALLEL); #endif @@ -501,7 +503,10 @@ TEST(BloomFilter, Scaling) { num_build.push_back(4000000); std::vector strategies; +#ifdef ARROW_ENABLE_THREADING strategies.push_back(BloomFilterBuildStrategy::PARALLEL); +#endif + strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); for (const auto hardware_flags : HardwareFlagsForTesting()) { for (const auto& strategy : strategies) { diff --git a/cpp/src/arrow/acero/plan_test.cc b/cpp/src/arrow/acero/plan_test.cc index ff7d2d7eca1a6..03e10483ebf00 100644 --- a/cpp/src/arrow/acero/plan_test.cc +++ b/cpp/src/arrow/acero/plan_test.cc @@ -36,6 +36,7 @@ #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/util/async_generator.h" +#include "arrow/util/config.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/thread_pool.h" @@ -1619,6 +1620,9 @@ TEST(ExecPlan, SourceEnforcesBatchLimit) { } TEST(ExecPlanExecution, SegmentedAggregationWithMultiThreading) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading enabled"; +#endif BatchesWithSchema data; data.batches = {ExecBatchFromJSON({int32()}, "[[1]]")}; data.schema = schema({field("i32", int32())}); diff --git a/cpp/src/arrow/acero/task_util.cc b/cpp/src/arrow/acero/task_util.cc index 8127902e69e94..4d8e9ecf76597 100644 --- a/cpp/src/arrow/acero/task_util.cc +++ b/cpp/src/arrow/acero/task_util.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/util/config.h" #include "arrow/util/logging.h" namespace arrow { @@ -316,7 +317,11 @@ Status TaskSchedulerImpl::StartScheduling(size_t thread_id, ScheduleImpl schedul int num_concurrent_tasks, bool use_sync_execution) { schedule_impl_ = std::move(schedule_impl); +#ifdef ARROW_ENABLE_THREADING use_sync_execution_ = use_sync_execution; +#else + use_sync_execution_ = true; +#endif num_concurrent_tasks_ = num_concurrent_tasks; num_tasks_to_schedule_.value += num_concurrent_tasks; return ScheduleMore(thread_id); diff --git a/cpp/src/arrow/acero/task_util.h b/cpp/src/arrow/acero/task_util.h index bc19396bd243b..fbd4af699d127 100644 --- a/cpp/src/arrow/acero/task_util.h +++ b/cpp/src/arrow/acero/task_util.h @@ -24,6 +24,7 @@ #include "arrow/acero/visibility.h" #include "arrow/status.h" +#include "arrow/util/config.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/acero/task_util_test.cc b/cpp/src/arrow/acero/task_util_test.cc index dafb6b24b4cdd..d5196ad4e0a03 100644 --- a/cpp/src/arrow/acero/task_util_test.cc +++ b/cpp/src/arrow/acero/task_util_test.cc @@ -27,6 +27,7 @@ #include "arrow/acero/util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/config.h" #include "arrow/util/thread_pool.h" namespace arrow { @@ -101,6 +102,9 @@ TaskScheduler::TaskGroupContinuationImpl MakeFinalContinuation( // concurrently. When all groups in that stage finish the next // stage is started. 
TEST(TaskScheduler, Stress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif constexpr int kNumThreads = 8; constexpr int kNumGroups = 8; constexpr int kGroupsPerStage = 3; @@ -176,6 +180,9 @@ TEST(TaskScheduler, Stress) { // thread starts a task group while another thread is finishing // the last of its tasks. TEST(TaskScheduler, StressTwo) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif constexpr int kNumThreads = 16; constexpr int kNumGroups = 8; constexpr int kTasksPerGroup = 1; diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index d2480cd482fc6..c76e79d79b449 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -31,6 +31,7 @@ #include "arrow/table.h" #include "arrow/testing/future_util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/config.h" #include "gtest/gtest.h" using namespace std::string_view_literals; // NOLINT @@ -380,6 +381,9 @@ TEST_F(DatasetWriterTestFixture, MinRowGroupBackpressure) { } TEST_F(DatasetWriterTestFixture, ConcurrentWritesSameFile) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Concurrent writes tests need threads"; +#endif // Use a gated filesystem to queue up many writes behind a file open to make sure the // file isn't opened multiple times. auto gated_fs = UseGatedFs(); @@ -394,6 +398,9 @@ TEST_F(DatasetWriterTestFixture, ConcurrentWritesSameFile) { } TEST_F(DatasetWriterTestFixture, ConcurrentWritesDifferentFiles) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Concurrent writes tests need threads"; +#endif // NBATCHES must be less than I/O executor concurrency to avoid deadlock / test failure constexpr int NBATCHES = 6; auto gated_fs = UseGatedFs(); @@ -412,6 +419,9 @@ TEST_F(DatasetWriterTestFixture, ConcurrentWritesDifferentFiles) { } TEST_F(DatasetWriterTestFixture, MaxOpenFiles) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Concurrent writes tests need threads"; +#endif auto gated_fs = UseGatedFs(); std::atomic paused = false; write_options_.max_open_files = 2; diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 6342388744f39..efe1f702b4868 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -71,6 +71,7 @@ #include "arrow/type_fwd.h" #include "arrow/util/async_generator_fwd.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/decimal.h" #include "arrow/util/future.h" #include "arrow/util/hash_util.h" @@ -4458,6 +4459,9 @@ TEST(Substrait, SetRelationBasic) { } TEST(Substrait, PlanWithAsOfJoinExtension) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "ASOF join requires threading"; +#endif // This demos an extension relation std::string substrait_json = R"({ "extensionUris": [], @@ -5477,6 +5481,10 @@ TEST(Substrait, MixedSort) { } TEST(Substrait, PlanWithExtension) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "ASOF join requires threading"; +#endif + // This demos an extension relation std::string substrait_json = R"({ "extensionUris": [], @@ -5665,6 +5673,9 @@ TEST(Substrait, PlanWithExtension) { } TEST(Substrait, AsOfJoinDefaultEmit) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "ASOF join requires threading"; +#endif std::string substrait_json = R"({ "extensionUris": [], "extensions": [], diff --git a/cpp/src/arrow/io/memory_test.cc 
b/cpp/src/arrow/io/memory_test.cc index 216d75f65ebb0..22f9a02fdbec8 100644 --- a/cpp/src/arrow/io/memory_test.cc +++ b/cpp/src/arrow/io/memory_test.cc @@ -42,6 +42,7 @@ #include "arrow/testing/util.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/future.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" @@ -918,6 +919,9 @@ TEST(CacheOptions, Basics) { } TEST(IOThreadPool, Capacity) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading enabled"; +#endif // Simple sanity check auto pool = internal::GetIOThreadPool(); int capacity = pool->GetCapacity(); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 6fc709874e710..c6de6b02fc8ef 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -55,6 +55,7 @@ #include "arrow/table.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" @@ -725,8 +726,25 @@ void TestInitialized(const ArrayData& array) { } void SleepFor(double seconds) { +#ifdef ARROW_ENABLE_THREADING std::this_thread::sleep_for( std::chrono::nanoseconds(static_cast(seconds * 1e9))); +#else + using Clock = std::chrono::steady_clock; + using DurationDouble = std::chrono::duration; + + auto secs_left = DurationDouble(seconds); + auto start_time = Clock::now(); + auto end_time = start_time + secs_left; + while (Clock::now() < end_time) { + bool run_task = arrow::internal::SerialExecutor::RunTasksOnAllExecutors(); + if (!run_task) { + // all executors are empty, just sleep for the rest of the time + std::this_thread::sleep_for(end_time - Clock::now()); + } + // run one task then check time + } +#endif } #ifdef _WIN32 @@ -1036,7 +1054,57 @@ class GatingTask::Impl : public std::enable_shared_from_this { return unlocked_future_; } + void WaitForEndOrUnlocked(std::chrono::time_point end_time, + arrow::internal::Executor* executor, Future<> future) { + if (unlocked_) { + num_finished_++; + future.MarkFinished(Status::OK()); + return; + } + if (std::chrono::steady_clock::now() > end_time) { + num_finished_++; + future.MarkFinished( + Status::Invalid("Task unlock never happened - if threads are disabled you " + "can't wait on gatedtask")); + return; + } + + SleepABit(); + auto spawn_status = executor->Spawn([this, end_time, executor, future]() { + WaitForEndOrUnlocked(end_time, executor, future); + }); + if (!spawn_status.ok()) { + status_ &= Status::Invalid("Couldn't spawn gating task unlock waiter"); + } + } + + Future<> RunTaskFuture() { + num_running_++; + // post the unlock check as a separate task + // otherwise we'll never let anything else run + // so nothing can unlock us + using Clock = std::chrono::steady_clock; + using DurationDouble = std::chrono::duration; + using DurationClock = std::chrono::steady_clock::duration; + + auto start_time = Clock::now(); + auto secs_left = DurationDouble(timeout_seconds_); + auto end_time = std::chrono::time_point_cast( + start_time + secs_left); + auto executor = arrow::internal::GetCpuThreadPool(); + auto future = Future<>::Make(); + auto spawn_status = executor->Spawn([this, end_time, executor, future]() { + WaitForEndOrUnlocked(end_time, executor, future); + }); + if (!spawn_status.ok()) { + status_ &= Status::Invalid("Couldn't spawn gating task unlock waiter"); + future.MarkFinished(Status::Invalid("")); + } + return 
future; + } + void RunTask() { +#ifdef ARROW_ENABLE_THREADING std::unique_lock lk(mx_); num_running_++; running_cv_.notify_all(); @@ -1048,9 +1116,16 @@ class GatingTask::Impl : public std::enable_shared_from_this { " seconds) waiting for the gating task to be unlocked"); } num_finished_++; +#else + // can't wait here for anything, so make a future to do the waiting + num_running_++; + auto future = RunTaskFuture(); + future.Wait(); +#endif } Status WaitForRunning(int count) { +#ifdef ARROW_ENABLE_THREADING std::unique_lock lk(mx_); if (running_cv_.wait_for( lk, std::chrono::nanoseconds(static_cast(timeout_seconds_ * 1e9)), @@ -1058,6 +1133,14 @@ class GatingTask::Impl : public std::enable_shared_from_this { return Status::OK(); } return Status::Invalid("Timed out waiting for tasks to launch"); +#else + BusyWait(timeout_seconds_, [this, count] { return num_running_ >= count; }); + if (num_running_ >= count) { + return Status::OK(); + } else { + return Status::Invalid("Timed out waiting for tasks to launch"); + } +#endif } Status Unlock() { @@ -1067,6 +1150,12 @@ class GatingTask::Impl : public std::enable_shared_from_this { unlocked_cv_.notify_all(); } unlocked_future_.MarkFinished(); +#ifndef ARROW_ENABLE_THREADING + while (num_finished_ != num_running_) { + arrow::internal::SerialExecutor::RunTasksOnAllExecutors(); + } +#endif + return status_; } diff --git a/cpp/src/arrow/util/async_generator_test.cc b/cpp/src/arrow/util/async_generator_test.cc index 37718f743fffc..7fb99f167c605 100644 --- a/cpp/src/arrow/util/async_generator_test.cc +++ b/cpp/src/arrow/util/async_generator_test.cc @@ -32,6 +32,7 @@ #include "arrow/type_fwd.h" #include "arrow/util/async_generator.h" #include "arrow/util/async_util.h" +#include "arrow/util/config.h" #include "arrow/util/test_common.h" #include "arrow/util/vector.h" @@ -994,6 +995,9 @@ TEST(TestAsyncUtil, GeneratorIterator) { } TEST(TestAsyncUtil, MakeTransferredGenerator) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif std::mutex mutex; std::condition_variable cv; std::atomic finished(false); @@ -1478,6 +1482,10 @@ TEST(TestAsyncUtil, ReadaheadMove) { } TEST(TestAsyncUtil, ReadaheadFailed) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + ASSERT_OK_AND_ASSIGN(auto thread_pool, internal::ThreadPool::Make(20)); std::atomic counter(0); auto gating_task = GatingTask::Make(); @@ -1512,6 +1520,9 @@ TEST(TestAsyncUtil, ReadaheadFailed) { } TEST(TestAsyncUtil, ReadaheadFailedWaitForInFlight) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif ASSERT_OK_AND_ASSIGN(auto thread_pool, internal::ThreadPool::Make(20)); // If a failure causes an early end then we should not emit that failure // until all in-flight futures have completed. 
This is to prevent tasks from diff --git a/cpp/src/arrow/util/config.h.cmake b/cpp/src/arrow/util/config.h.cmake index 1008b9c6b9a05..f7125cfd8a235 100644 --- a/cpp/src/arrow/util/config.h.cmake +++ b/cpp/src/arrow/util/config.h.cmake @@ -51,6 +51,7 @@ #cmakedefine ARROW_PARQUET #cmakedefine ARROW_SUBSTRAIT +#cmakedefine ARROW_ENABLE_THREADING #cmakedefine ARROW_GCS #cmakedefine ARROW_S3 #cmakedefine ARROW_USE_NATIVE_INT128 diff --git a/cpp/src/arrow/util/future.cc b/cpp/src/arrow/util/future.cc index c430ad1fc738f..a5426f949e721 100644 --- a/cpp/src/arrow/util/future.cc +++ b/cpp/src/arrow/util/future.cc @@ -25,6 +25,7 @@ #include #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/logging.h" #include "arrow/util/thread_pool.h" #include "arrow/util/tracing_internal.h" @@ -89,7 +90,7 @@ class ConcreteFutureImpl : public FutureImpl { case ShouldSchedule::IfUnfinished: return !in_add_callback; case ShouldSchedule::IfDifferentExecutor: - return !callback_record.options.executor->OwnsThisThread(); + return !(callback_record.options.executor->IsCurrentExecutor()); default: DCHECK(false) << "Unrecognized ShouldSchedule option"; return false; @@ -149,17 +150,47 @@ class ConcreteFutureImpl : public FutureImpl { } void DoWait() { +#ifdef ARROW_ENABLE_THREADING std::unique_lock lock(mutex_); cv_.wait(lock, [this] { return IsFutureFinished(state_); }); +#else + auto last_processed_time = std::chrono::steady_clock::now(); + while (true) { + if (IsFutureFinished(state_)) { + return; + } + if (arrow::internal::SerialExecutor::RunTasksOnAllExecutors() == false) { + auto this_time = std::chrono::steady_clock::now(); + if (this_time - last_processed_time > std::chrono::seconds(10)) { + ARROW_LOG(WARNING) << "Waiting for future, but no executors have had any tasks " + "pending for last 10 seconds"; + last_processed_time = std::chrono::steady_clock::now(); + } + } + } +#endif } bool DoWait(double seconds) { +#ifdef ARROW_ENABLE_THREADING std::unique_lock lock(mutex_); cv_.wait_for(lock, std::chrono::duration(seconds), [this] { return IsFutureFinished(state_); }); return IsFutureFinished(state_); +#else + auto start = std::chrono::steady_clock::now(); + auto fsec = std::chrono::duration(seconds); + while (std::chrono::steady_clock::now() - start < fsec) { + // run one task then check time + if (IsFutureFinished(state_)) { + return true; + } + arrow::internal::SerialExecutor::RunTasksOnAllExecutors(); + } + return IsFutureFinished(state_); +#endif } std::mutex mutex_; diff --git a/cpp/src/arrow/util/future_test.cc b/cpp/src/arrow/util/future_test.cc index 689b7c3df62dd..87891e48efa5e 100644 --- a/cpp/src/arrow/util/future_test.cc +++ b/cpp/src/arrow/util/future_test.cc @@ -1073,7 +1073,7 @@ TEST_F(FutureSchedulingTest, ScheduleIfDifferentExecutor) { struct : internal::Executor { int GetCapacity() override { return pool_->GetCapacity(); } - bool OwnsThisThread() override { return pool_->OwnsThisThread(); } + bool IsCurrentExecutor() override { return pool_->IsCurrentExecutor(); } Status SpawnReal(internal::TaskHints hints, internal::FnOnce task, StopToken stop_token, StopCallback&& stop_callback) override { @@ -1100,8 +1100,7 @@ TEST_F(FutureSchedulingTest, ScheduleIfDifferentExecutor) { auto fut0_done = fut0.Then( [&] { // marked finished on main thread -> must be scheduled to executor - fut0_on_executor.store(executor.OwnsThisThread()); - + fut0_on_executor.store(executor.IsCurrentExecutor()); fut1.MarkFinished(); }, pass_err, options); @@ -1109,7
TEST_F(FutureSchedulingTest, ScheduleIfDifferentExecutor) { auto fut1_done = fut1.Then( [&] { // marked finished on executor -> no need to schedule - fut1_on_executor.store(executor.OwnsThisThread()); + fut1_on_executor.store(executor.IsCurrentExecutor()); }, pass_err, options); diff --git a/cpp/src/arrow/util/iterator_test.cc b/cpp/src/arrow/util/iterator_test.cc index ab62fcb7034b7..ba21ddcced209 100644 --- a/cpp/src/arrow/util/iterator_test.cc +++ b/cpp/src/arrow/util/iterator_test.cc @@ -438,10 +438,12 @@ TEST(ReadaheadIterator, Trace) { ASSERT_EQ(values[i], TestInt()); } +#ifdef ARROW_ENABLE_THREADING // Values were all emitted from the same thread, and it's not this thread const auto& thread_ids = tracing->thread_ids(); ASSERT_EQ(thread_ids.size(), 1); ASSERT_NE(*thread_ids.begin(), std::this_thread::get_id()); +#endif } TEST(ReadaheadIterator, NextError) { diff --git a/cpp/src/arrow/util/task_group.cc b/cpp/src/arrow/util/task_group.cc index 932f642041d4e..0f08e7bde9cb6 100644 --- a/cpp/src/arrow/util/task_group.cc +++ b/cpp/src/arrow/util/task_group.cc @@ -24,6 +24,7 @@ #include #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/logging.h" #include "arrow/util/thread_pool.h" @@ -128,12 +129,19 @@ class ThreadedTaskGroup : public TaskGroup { bool ok() const override { return ok_.load(); } Status Finish() override { +#ifdef ARROW_ENABLE_THREADING std::unique_lock lock(mutex_); if (!finished_) { cv_.wait(lock, [&]() { return nremaining_.load() == 0; }); // Current tasks may start other tasks, so only set this when done finished_ = true; } +#else + while (!finished_ && nremaining_.load() != 0) { + arrow::internal::SerialExecutor::RunTasksOnAllExecutors(); + } + finished_ = true; +#endif return status_; } diff --git a/cpp/src/arrow/util/thread_pool.cc b/cpp/src/arrow/util/thread_pool.cc index daffe8f077a2f..d82934c9bec01 100644 --- a/cpp/src/arrow/util/thread_pool.cc +++ b/cpp/src/arrow/util/thread_pool.cc @@ -27,6 +27,7 @@ #include #include "arrow/util/atfork_internal.h" +#include "arrow/util/config.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/mutex.h" @@ -60,11 +61,53 @@ struct SerialExecutor::State { std::thread::id current_thread; bool paused{false}; bool finished{false}; +#ifndef ARROW_ENABLE_THREADING + int max_tasks_running{1}; + int tasks_running{0}; +#endif +}; + +#ifndef ARROW_ENABLE_THREADING +// list of all SerialExecutor objects - as we need to run tasks from all pools at once in +// Run() +struct SerialExecutorGlobalState { + // a set containing all the executors that currently exist + std::unordered_set all_executors; + + // this is the executor which is currently running a task + SerialExecutor* current_executor = NULL; + + // in RunTasksOnAllExecutors we run tasks on executors in turn + // this is used to keep track of the last fired task so that it + // doesn't always run tasks on the first executor + // in case of nested calls to RunTasksOnAllExecutors + SerialExecutor* last_called_executor = NULL; }; -SerialExecutor::SerialExecutor() : state_(std::make_shared()) {} +static SerialExecutorGlobalState* GetSerialExecutorGlobalState() { + static SerialExecutorGlobalState state; + return &state; +} + +SerialExecutor* SerialExecutor::GetCurrentExecutor() { + return GetSerialExecutorGlobalState()->current_executor; +} + +bool SerialExecutor::IsCurrentExecutor() { return GetCurrentExecutor() == this; } + +#endif + +SerialExecutor::SerialExecutor() : state_(std::make_shared()) { +#ifndef 
ARROW_ENABLE_THREADING + GetSerialExecutorGlobalState()->all_executors.insert(this); + state_->max_tasks_running = 1; +#endif +} SerialExecutor::~SerialExecutor() { +#ifndef ARROW_ENABLE_THREADING + GetSerialExecutorGlobalState()->all_executors.erase(this); +#endif auto state = state_; std::unique_lock lk(state->mutex); if (!state->task_queue.empty()) { @@ -77,6 +120,12 @@ SerialExecutor::~SerialExecutor() { } } +int SerialExecutor::GetNumTasks() { + auto state = state_; + return static_cast(state_->task_queue.size()); +} + +#ifdef ARROW_ENABLE_THREADING Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, StopToken stop_token, StopCallback&& stop_callback) { #ifdef ARROW_WITH_OPENTELEMETRY @@ -111,21 +160,55 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, return Status::OK(); } -void SerialExecutor::Pause() { - // Same comment as SpawnReal above +void SerialExecutor::Finish() { auto state = state_; { std::lock_guard lk(state->mutex); - state->paused = true; + state->finished = true; } state->wait_for_tasks.notify_one(); } +#else // ARROW_ENABLE_THREADING +Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, + StopToken stop_token, StopCallback&& stop_callback) { +#ifdef ARROW_WITH_OPENTELEMETRY + // Wrap the task to propagate a parent tracing span to it + // XXX should there be a generic utility in tracing_internal.h for this? + task = [func = std::move(task), + active_span = + ::arrow::internal::tracing::GetTracer()->GetCurrentSpan()]() mutable { + auto scope = ::arrow::internal::tracing::GetTracer()->WithActiveSpan(active_span); + std::move(func)(); + }; +#endif // ARROW_WITH_OPENTELEMETRY + + if (state_->finished) { + return Status::Invalid( + "Attempt to schedule a task on a serial executor that has already finished or " + "been abandoned"); + } + + state_->task_queue.push_back( + Task{std::move(task), std::move(stop_token), std::move(stop_callback)}); + + return Status::OK(); +} + void SerialExecutor::Finish() { + auto state = state_; + { state->finished = true; } + // empty any tasks from the loop on finish + RunLoop(); +} + +#endif // ARROW_ENABLE_THREADING +void SerialExecutor::Pause() { + // Same comment as SpawnReal above auto state = state_; { std::lock_guard lk(state->mutex); - state->finished = true; + state->paused = true; } state->wait_for_tasks.notify_one(); } @@ -147,6 +230,7 @@ bool SerialExecutor::OwnsThisThread() { std::lock_guard lk(state_->mutex); return std::this_thread::get_id() == state_->current_thread; } +#ifdef ARROW_ENABLE_THREADING void SerialExecutor::RunLoop() { // This is called from the SerialExecutor's main thread, so the @@ -183,6 +267,110 @@ void SerialExecutor::RunLoop() { } state_->current_thread = {}; } +#else // ARROW_ENABLE_THREADING +bool SerialExecutor::RunTasksOnAllExecutors() { + auto globalState = GetSerialExecutorGlobalState(); + // if the previously called executor was deleted, ignore last_called_executor + if (globalState->last_called_executor != NULL && + globalState->all_executors.count(globalState->last_called_executor) == 0) { + globalState->last_called_executor = NULL; + } + bool run_task = true; + bool keep_going = true; + while (keep_going) { + run_task = false; + keep_going = false; + for (auto it = globalState->all_executors.begin(); + it != globalState->all_executors.end(); ++it) { + if (globalState->last_called_executor != NULL) { + // always rerun loop if we have a last_called_executor, otherwise + // we may drop out before every executor has been checked + keep_going = true; + if 
(globalState->all_executors.count(globalState->last_called_executor) == 0 || + globalState->last_called_executor == *it) { + // found the last one (or it doesn't exist in the set any more) + // now we can start running things + globalState->last_called_executor = NULL; + } + // skip until after we have seen the last executor we called + // so that we do things nicely in turn + continue; + } + auto exe = *it; + // don't make more reentrant calls inside an + // executor than the number of concurrent tasks set on a threadpool, or + // 1 in the case of a serialexecutor - + // this is because users will expect a serial executor not to be able to + // run the next task until the current one is finished (and a threadpool + // only to be able to run a certain number of tasks concurrently) + if (exe->state_->tasks_running >= exe->state_->max_tasks_running) { + continue; + } + if (exe->state_->paused == false && exe->state_->task_queue.empty() == false) { + SerialExecutor* old_exe = globalState->current_executor; + globalState->current_executor = exe; + Task task = std::move(exe->state_->task_queue.front()); + exe->state_->task_queue.pop_front(); + run_task = true; + exe->state_->tasks_running += 1; + if (!task.stop_token.IsStopRequested()) { + std::move(task.callable)(); + } else { + if (task.stop_callback) { + std::move(task.stop_callback)(task.stop_token.Poll()); + } + } + exe->state_->tasks_running -= 1; + globalState->current_executor = old_exe; + + globalState->last_called_executor = exe; + keep_going = false; + break; + } + } + } + return run_task; +} + +// run tasks in this thread and queue things from other executors if required +// (e.g. when a compute task depends on an IO request) +void SerialExecutor::RunLoop() { + auto globalState = GetSerialExecutorGlobalState(); + // If paused we break out immediately. If finished we only break out + // when all work is done.
+ while (!state_->paused && !(state_->finished && state_->task_queue.empty())) { + // first empty us until paused or empty + // if we're already running as many tasks as possible then + // we can't run any more until something else drops off the queue + if (state_->tasks_running <= state_->max_tasks_running) { + while (!state_->paused && !state_->task_queue.empty()) { + Task task = std::move(state_->task_queue.front()); + state_->task_queue.pop_front(); + auto last_executor = globalState->current_executor; + globalState->current_executor = this; + state_->tasks_running += 1; + if (!task.stop_token.IsStopRequested()) { + std::move(task.callable)(); + } else { + if (task.stop_callback) { + std::move(task.stop_callback)(task.stop_token.Poll()); + } + } + state_->tasks_running -= 1; + globalState->current_executor = last_executor; + } + if (state_->paused || (state_->finished && state_->task_queue.empty())) { + break; + } + } + // now wait for anything on other executors (unless we're finished in which case it + // will drop out of the outer loop) + RunTasksOnAllExecutors(); + } +} +#endif // ARROW_ENABLE_THREADING + +#ifdef ARROW_ENABLE_THREADING struct ThreadPool::State { State() = default; @@ -532,6 +720,65 @@ int ThreadPool::DefaultCapacity() { return capacity; } +#else // ARROW_ENABLE_THREADING +ThreadPool::ThreadPool() { + // default to max 'concurrency' of 8 + // if threading is disabled + state_->max_tasks_running = 8; +} + +Status ThreadPool::Shutdown(bool wait) { + state_->finished = true; + if (wait) { + RunLoop(); + } else { + // clear any pending tasks so that we behave + // the same as threadpool on fast shutdown + state_->task_queue.clear(); + } + return Status::OK(); +} + +// Wait for the 'thread pool' to become idle +// including running tasks from other pools if +// needed +void ThreadPool::WaitForIdle() { + while (!state_->task_queue.empty()) { + RunTasksOnAllExecutors(); + } +} + +Status ThreadPool::SetCapacity(int threads) { + state_->max_tasks_running = threads; + return Status::OK(); +} + +int ThreadPool::GetCapacity() { return state_->max_tasks_running; } + +int ThreadPool::GetActualCapacity() { return state_->max_tasks_running; } + +Result> ThreadPool::Make(int threads) { + auto pool = std::shared_ptr(new ThreadPool()); + RETURN_NOT_OK(pool->SetCapacity(threads)); + return pool; +} + +Result> ThreadPool::MakeEternal(int threads) { + ARROW_ASSIGN_OR_RAISE(auto pool, Make(threads)); + // On Windows, the ThreadPool destructor may be called after non-main threads + // have been killed by the OS, and hang in a condition variable. + // On Unix, we want to avoid leak reports by Valgrind.
+ return pool; +} + +ThreadPool::~ThreadPool() { + // clear threadpool, otherwise ~SerialExecutor will + // run any tasks left (which isn't threadpool behaviour) + state_->task_queue.clear(); +} + +#endif // ARROW_ENABLE_THREADING + // Helper for the singleton pattern std::shared_ptr ThreadPool::MakeCpuThreadPool() { auto maybe_pool = ThreadPool::MakeEternal(ThreadPool::DefaultCapacity()); diff --git a/cpp/src/arrow/util/thread_pool.h b/cpp/src/arrow/util/thread_pool.h index 4e0fd84068c91..eba79fc05d7e0 100644 --- a/cpp/src/arrow/util/thread_pool.h +++ b/cpp/src/arrow/util/thread_pool.h @@ -21,11 +21,13 @@ #include #include #include +#include #include #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/cancel.h" +#include "arrow/util/config.h" #include "arrow/util/functional.h" #include "arrow/util/future.h" #include "arrow/util/iterator.h" @@ -194,6 +196,11 @@ class ARROW_EXPORT Executor { // Executor. Returns false if this Executor does not support this property. virtual bool OwnsThisThread() { return false; } + // Return true if this is the current executor being called + // n.b. this defaults to just calling OwnsThisThread + // unless the threadpool is disabled + virtual bool IsCurrentExecutor() { return OwnsThisThread(); } + /// \brief An interface to represent something with a custom destructor /// /// \see KeepAlive @@ -276,6 +283,9 @@ class ARROW_EXPORT SerialExecutor : public Executor { Status SpawnReal(TaskHints hints, FnOnce task, StopToken, StopCallback&&) override; + // Return the number of tasks either running or in the queue. + int GetNumTasks(); + /// \brief Runs the TopLevelTask and any scheduled tasks /// /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid @@ -347,8 +357,13 @@ class ARROW_EXPORT SerialExecutor : public Executor { // the next call. executor->Pause(); }); +#ifdef ARROW_ENABLE_THREADING + // future must run on this thread // Borrow this thread and run tasks until the future is finished executor->RunLoop(); +#else + next_fut.Wait(); +#endif if (!next_fut.is_finished()) { // Not clear this is possible since RunLoop wouldn't generally exit // unless we paused/finished which would imply next_fut has been @@ -367,14 +382,26 @@ class ARROW_EXPORT SerialExecutor : public Executor { return Iterator(SerialIterator{std::move(serial_executor), std::move(generator)}); } - private: - SerialExecutor(); +#ifndef ARROW_ENABLE_THREADING + // run a pending task from loop + // returns true if any tasks were run in the last go round the loop (i.e. if it + // returns false, all executors are waiting) + static bool RunTasksOnAllExecutors(); + static SerialExecutor* GetCurrentExecutor(); + + bool IsCurrentExecutor() override; + +#endif + + protected: + virtual void RunLoop(); // State uses mutex struct State; std::shared_ptr state_; - void RunLoop(); + SerialExecutor(); + // We mark the serial executor "finished" when there should be // no more tasks scheduled on it. 
It's not strictly needed but // can help catch bugs where we are trying to use the executor @@ -393,8 +420,23 @@ class ARROW_EXPORT SerialExecutor : public Executor { RunLoop(); return final_fut; } + +#ifndef ARROW_ENABLE_THREADING + // we have to run tasks from all live executors + // during RunLoop if we don't have threading + static std::unordered_set all_executors; + // a pointer to the last one called by the loop + // so all tasks get spawned equally + // on multiple calls to RunTasksOnAllExecutors + static SerialExecutor* last_called_executor; + // without threading we can't tell which executor called the + // current process - so we set it in spawning the task + static SerialExecutor* current_executor; +#endif // ARROW_ENABLE_THREADING }; +#ifdef ARROW_ENABLE_THREADING + /// An Executor implementation spawning tasks in FIFO manner on a fixed-size /// pool of worker threads. /// @@ -418,11 +460,10 @@ class ARROW_EXPORT ThreadPool : public Executor { // match this value. int GetCapacity() override; - bool OwnsThisThread() override; - // Return the number of tasks either running or in the queue. int GetNumTasks(); + bool OwnsThisThread() override; // Dynamically change the number of worker threads. // // This function always returns immediately. @@ -475,6 +516,58 @@ class ARROW_EXPORT ThreadPool : public Executor { State* state_; bool shutdown_on_destroy_; }; +#else // ARROW_ENABLE_THREADING +// an executor implementation which pretends to be a thread pool but runs everything +// on the main thread using a static queue (shared between all thread pools, otherwise +// cross-threadpool dependencies will break everything) +class ARROW_EXPORT ThreadPool : public SerialExecutor { + public: + ARROW_FRIEND_EXPORT friend ThreadPool* GetCpuThreadPool(); + + static Result> Make(int threads); + + // Like Make(), but takes care that the returned ThreadPool is compatible + // with destruction late at process exit. + static Result> MakeEternal(int threads); + + // Destroy thread pool; the pool will first be shut down + ~ThreadPool() override; + + // Return the desired number of worker threads. + // The actual number of workers may lag a bit before being adjusted to + // match this value. + int GetCapacity() override; + + virtual int GetActualCapacity(); + + bool OwnsThisThread() override { return true; } + + // Dynamically change the number of worker threads. + // without threading this is equal to the + // number of tasks that can be running at once + // (inside each other) + Status SetCapacity(int threads); + + static int DefaultCapacity() { return 8; } + + // Shutdown the pool. Once the pool starts shutting down, new tasks + // cannot be submitted anymore. + // If "wait" is true, shutdown waits for all pending tasks to be finished. + // If "wait" is false, workers are stopped as soon as currently executing + // tasks are finished. + Status Shutdown(bool wait = true); + + // Wait for the thread pool to become idle + // + // This is useful for sequencing tests + void WaitForIdle(); + + protected: + static std::shared_ptr MakeCpuThreadPool(); + ThreadPool(); +}; + +#endif // ARROW_ENABLE_THREADING // Return the process-global thread pool for CPU-bound tasks. 
ARROW_EXPORT ThreadPool* GetCpuThreadPool(); diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc index bce07d6908a11..ad30ca2e8052d 100644 --- a/cpp/src/arrow/util/thread_pool_test.cc +++ b/cpp/src/arrow/util/thread_pool_test.cc @@ -37,6 +37,7 @@ #include "arrow/testing/executor_util.h" #include "arrow/testing/future_util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/config.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -583,6 +584,9 @@ TEST_F(TestThreadPool, StressSpawn) { } TEST_F(TestThreadPool, OwnsCurrentThread) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif auto pool = this->MakeThreadPool(30); std::atomic one_failed{false}; @@ -600,6 +604,10 @@ TEST_F(TestThreadPool, OwnsCurrentThread) { } TEST_F(TestThreadPool, StressSpawnThreaded) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto pool = this->MakeThreadPool(30); SpawnAddsThreaded(pool.get(), 20, 100, task_add); } @@ -616,6 +624,9 @@ TEST_F(TestThreadPool, StressSpawnSlow) { } TEST_F(TestThreadPool, StressSpawnSlowThreaded) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif auto pool = this->MakeThreadPool(30); SpawnAddsThreaded(pool.get(), 20, 100, task_slow_add{/*seconds=*/0.002}); } @@ -627,6 +638,9 @@ TEST_F(TestThreadPool, SpawnWithStopToken) { } TEST_F(TestThreadPool, StressSpawnThreadedWithStopToken) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif StopSource stop_source; auto pool = this->MakeThreadPool(30); SpawnAddsThreaded(pool.get(), 20, 100, task_add, stop_source.token()); @@ -639,6 +653,9 @@ TEST_F(TestThreadPool, SpawnWithStopTokenCancelled) { } TEST_F(TestThreadPool, StressSpawnThreadedWithStopTokenCancelled) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif StopSource stop_source; auto pool = this->MakeThreadPool(30); SpawnAddsThreadedAndCancel(pool.get(), 20, 100, task_slow_add{/*seconds=*/0.02}, @@ -656,6 +673,7 @@ TEST_F(TestThreadPool, QuickShutdown) { add_tester.CheckNotAllComputed(); } +#ifdef ARROW_ENABLE_THREADING TEST_F(TestThreadPool, SetCapacity) { auto pool = this->MakeThreadPool(5); @@ -717,7 +735,17 @@ TEST_F(TestThreadPool, SetCapacity) { // Ensure nothing got stuck ASSERT_OK(pool->Shutdown()); } +#else // ARROW_ENABLE_THREADING +TEST_F(TestThreadPool, SetCapacity) { + auto pool = this->MakeThreadPool(5); + + ASSERT_EQ(pool->GetCapacity(), 5); + ASSERT_EQ(pool->GetActualCapacity(), 5); + ASSERT_OK(pool->SetCapacity(7)); + ASSERT_EQ(pool->GetCapacity(), 7); +} +#endif // Test Submit() functionality TEST_F(TestThreadPool, Submit) { @@ -802,6 +830,10 @@ class TestThreadPoolForkSafety : public TestThreadPool {}; TEST_F(TestThreadPoolForkSafety, Basics) { { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + // Fork after task submission auto pool = this->MakeThreadPool(3); ASSERT_OK_AND_ASSIGN(auto fut, pool->Submit(add, 4, 5)); @@ -845,6 +877,9 @@ TEST_F(TestThreadPoolForkSafety, Basics) { } TEST_F(TestThreadPoolForkSafety, MultipleChildThreads) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif // ARROW-15593: race condition in after-fork ThreadPool reinitialization // when SpawnReal() was called from multiple threads in a forked child. 
auto run_in_child = [](ThreadPool* pool) { @@ -894,6 +929,9 @@ TEST_F(TestThreadPoolForkSafety, NestedChild) { { #ifdef __APPLE__ GTEST_SKIP() << "Nested fork is not supported on macos"; +#endif +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; #endif auto pool = this->MakeThreadPool(3); ASSERT_OK_AND_ASSIGN(auto fut, pool->Submit(add, 4, 5)); @@ -928,6 +966,9 @@ #endif TEST(TestGlobalThreadPool, Capacity) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif // Sanity check auto pool = GetCpuThreadPool(); int capacity = pool->GetCapacity(); diff --git a/cpp/src/parquet/encryption/key_management_test.cc b/cpp/src/parquet/encryption/key_management_test.cc index f733c43ee1e79..6f80ab42c9a96 100644 --- a/cpp/src/parquet/encryption/key_management_test.cc +++ b/cpp/src/parquet/encryption/key_management_test.cc @@ -324,6 +324,9 @@ TEST_F(TestEncryptionKeyManagement, KeyRotationWithInternalMaterial) { } TEST_F(TestEncryptionKeyManagementMultiThread, WrapLocally) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif this->SetupCryptoFactory(true); this->WriteEncryptedParquetFiles(); @@ -331,6 +334,9 @@ } TEST_F(TestEncryptionKeyManagementMultiThread, WrapOnServer) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif this->SetupCryptoFactory(false); this->WriteEncryptedParquetFiles(); diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 941506b9c2abc..bad5b3c1f8988 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1225,6 +1225,15 @@ tasks: image: ubuntu-cpp {% endfor %} + test-ubuntu-22.04-cpp-no-threading: + ci: github + template: docker-tests/github.linux.yml + params: + env: + UBUNTU: 22.04 + flags: "-e ARROW_ENABLE_THREADING=OFF" + image: ubuntu-cpp + test-ubuntu-20.04-cpp-thread-sanitizer: ci: github template: docker-tests/github.linux.yml From 2972e494ff906a2261db5e67b305c5af9d812544 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 9 Aug 2023 10:52:58 +0100 Subject: [PATCH 123/749] GH-37019: [R] Documentation for read_parquet() et al needs updating (#37020) ### Rationale for this change Docs were out of date with code after previous changes to returned object type ### What changes are included in this PR? Update docs to reflect correct return type ### Are these changes tested? No ### Are there any user-facing changes? No * Closes: #37019 Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/R/csv.R | 6 +++--- r/R/dplyr-funcs-doc.R | 2 +- r/R/feather.R | 2 +- r/R/ipc-stream.R | 4 ++-- r/R/json.R | 2 +- r/R/parquet.R | 4 ++-- r/man/acero.Rd | 2 +- r/man/read_delim_arrow.Rd | 6 +++--- r/man/read_feather.Rd | 4 ++-- r/man/read_ipc_stream.Rd | 4 ++-- r/man/read_json_arrow.Rd | 4 ++-- r/man/read_parquet.Rd | 6 +++--- 12 files changed, 23 insertions(+), 23 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index c8a13630d2f58..d53dc07b42454 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -17,7 +17,7 @@ #' Read a CSV or other delimited file with Arrow #' -#' These functions uses the Arrow C++ CSV reader to read into a `data.frame`. +#' These functions use the Arrow C++ CSV reader to read into a `tibble`. #' Arrow C++ options have been mapped to argument names that follow those of #' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
#' @@ -127,10 +127,10 @@ #' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.). #' @param convert_options see [file reader options][CsvReadOptions] #' @param read_options see [file reader options][CsvReadOptions] -#' @param as_data_frame Should the function return a `data.frame` (default) or +#' @param as_data_frame Should the function return a `tibble` (default) or #' an Arrow [Table]? #' -#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`. +#' @return A `tibble`, or a Table if `as_data_frame = FALSE`. #' @export #' @examples #' tf <- tempfile() diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index a472c572cd9f4..5099e903da670 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -34,7 +34,7 @@ #' to a `dbplyr::tbl_lazy`. This means that the verbs do not eagerly evaluate #' the query on the data. To run the query, call either `compute()`, #' which returns an `arrow` [Table], or `collect()`, which pulls the resulting -#' Table into an R `data.frame`. +#' Table into an R `tibble`. #' #' * [`anti_join()`][dplyr::anti_join()]: the `copy` and `na_matches` arguments are ignored #' * [`arrange()`][dplyr::arrange()] diff --git a/r/R/feather.R b/r/R/feather.R index 24971669fc536..3e390018c825f 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -157,7 +157,7 @@ write_ipc_file <- function(x, #' @inheritParams read_delim_arrow #' @inheritParams make_readable_file #' -#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an +#' @return A `tibble` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise #' #' @export diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index 7144132393446..37ef0bbaf2126 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -87,11 +87,11 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' If a file name or URI, an Arrow [InputStream] will be opened and #' closed when finished. If an input stream is provided, it will be left #' open. -#' @param as_data_frame Should the function return a `data.frame` (default) or +#' @param as_data_frame Should the function return a `tibble` (default) or #' an Arrow [Table]? #' @param ... extra parameters passed to `read_feather()`. #' -#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an +#' @return A `tibble` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise #' @seealso [write_feather()] for writing IPC files. [RecordBatchReader] for a #' lower-level interface. diff --git a/r/R/json.R b/r/R/json.R index e8131b37f2584..8936cc3932dcd 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -38,7 +38,7 @@ #' @param schema [Schema] that describes the table. #' @param ... Additional options passed to `JsonTableReader$create()` #' -#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`. +#' @return A `tibble`, or a Table if `as_data_frame = FALSE`. #' @export #' @examplesIf arrow_with_json() #' tf <- tempfile() diff --git a/r/R/parquet.R b/r/R/parquet.R index 1335e85219211..db224a41e4019 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -24,8 +24,8 @@ #' @param props [ParquetArrowReaderProperties] #' @param ... Additional arguments passed to `ParquetFileReader$create()` #' -#' @return A [arrow::Table][Table], or a `data.frame` if `as_data_frame` is -#' `TRUE` (the default). +#' @return A `tibble` if `as_data_frame` is `TRUE` (the default), or an +#' Arrow [Table] otherwise. 
#' @examplesIf arrow_with_parquet() && !getFromNamespace("on_linux_dev", "arrow")() #' tf <- tempfile() #' on.exit(unlink(tf)) diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 0a35bccbc7491..c9fb4d37a1d95 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -21,7 +21,7 @@ Most verb functions return an \code{arrow_dplyr_query} object, similar in spirit to a \code{dbplyr::tbl_lazy}. This means that the verbs do not eagerly evaluate the query on the data. To run the query, call either \code{compute()}, which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting -Table into an R \code{data.frame}. +Table into an R \code{tibble}. \itemize{ \item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:arrange]{arrange()}} diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index f15da07afb87d..cd07c8ad07e5f 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -128,7 +128,7 @@ parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, et \item{read_options}{see \link[=CsvReadOptions]{file reader options}} -\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +\item{as_data_frame}{Should the function return a \code{tibble} (default) or an Arrow \link{Table}?} \item{timestamp_parsers}{User-defined timestamp parsers. If more than one starting from the beginning of this vector. Possible values are: }} } \value{ -A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. +A \code{tibble}, or a Table if \code{as_data_frame = FALSE}. } \description{ -These functions uses the Arrow C++ CSV reader to read into a \code{data.frame}. +These functions use the Arrow C++ CSV reader to read into a \code{tibble}. Arrow C++ options have been mapped to argument names that follow those of \code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}. } diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index 000aa541aacf6..c3b4a54158c7f 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -21,13 +21,13 @@ open.} \link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} -\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +\item{as_data_frame}{Should the function return a \code{tibble} (default) or an Arrow \link{Table}?} \item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})} } \value{ -A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an +A \code{tibble} if \code{as_data_frame} is \code{TRUE} (the default), or an Arrow \link{Table} otherwise } \description{ diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index 63b50e7c1b002..db930b52bde18 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -13,13 +13,13 @@ If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished.
If an input stream is provided, it will be left open.} -\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +\item{as_data_frame}{Should the function return a \code{tibble} (default) or an Arrow \link{Table}?} \item{...}{extra parameters passed to \code{read_feather()}.} } \value{ -A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an +A \code{tibble} if \code{as_data_frame} is \code{TRUE} (the default), or an Arrow \link{Table} otherwise } \description{ diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 7231f5a83472f..9230a9a017495 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -28,7 +28,7 @@ To be recognised as literal data, the input must be wrapped with \code{I()}.} \link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} -\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +\item{as_data_frame}{Should the function return a \code{tibble} (default) or an Arrow \link{Table}?} \item{schema}{\link{Schema} that describes the table.} @@ -36,7 +36,7 @@ an Arrow \link{Table}?} \item{...}{Additional options passed to \code{JsonTableReader$create()}} } \value{ -A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. +A \code{tibble}, or a Table if \code{as_data_frame = FALSE}. } \description{ Wrapper around \link{JsonTableReader} to read a newline-delimited JSON (ndjson) file into a diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index 68e56903d1485..3bb76cc2e30ca 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -24,7 +24,7 @@ open.} \link[tidyselect:eval_select]{tidy selection specification} of columns, as used in \code{dplyr::select()}.} -\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +\item{as_data_frame}{Should the function return a \code{tibble} (default) or an Arrow \link{Table}?} \item{props}{\link{ParquetArrowReaderProperties}} @@ -32,8 +32,8 @@ an Arrow \link{Table}?} \item{...}{Additional arguments passed to \code{ParquetFileReader$create()}} } \value{ -A \link[=Table]{arrow::Table}, or a \code{data.frame} if \code{as_data_frame} is -\code{TRUE} (the default). +A \code{tibble} if \code{as_data_frame} is \code{TRUE} (the default), or an +Arrow \link{Table} otherwise. } \description{ '\href{https://parquet.apache.org/}{Parquet}' is a columnar storage file format. From d3ccc833a61b70a988090cd8065d3e38d7c29a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E5=A4=A9?= Date: Wed, 9 Aug 2023 19:33:37 +0800 Subject: [PATCH 124/749] GH-36867: [C++] Add a struct_ and schema overload taking a vector of (name, type) pairs (#36915) ### Rationale for this change Mostly for convenience. It would be nice to be able to write: ```struct_({{"a", int8()}, {"b", utf8()}});``` instead of: ```struct_({field("a", int8()), field("b", utf8())});``` Same with the schema factory. ### What changes are included in this PR? Add a struct_ overload and two schema overloads taking a vector of (name, type) pairs to construct a vector of fields. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. Add three ARROW_EXPORT functions.
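For reference, a minimal sketch of how the new factories compose (the helper name and field names here are illustrative, not part of the change):

```
#include <memory>
#include "arrow/type.h"

// Build a nested struct type and a schema directly from (name, type) pairs,
// using the overloads added in this change.
std::shared_ptr<arrow::Schema> MakeExampleSchema() {
  auto address = arrow::struct_({{"city", arrow::utf8()}, {"zip", arrow::int32()}});
  return arrow::schema({{"id", arrow::int64()}, {"address", address}});
}
```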
* Closes: #36867 Authored-by: jsjtxietian Signed-off-by: Antoine Pitrou --- .../arrow/engine/substrait/type_internal.cc | 7 +++-- cpp/src/arrow/type.cc | 28 ++++++++++++++++++ cpp/src/arrow/type_fwd.h | 29 +++++++++++++++++++ cpp/src/arrow/type_test.cc | 13 +++++++++ cpp/src/parquet/arrow/schema.cc | 2 +- 5 files changed, 75 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 03d1f999a1491..1f9141f36ba6b 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -77,9 +77,10 @@ Result FieldsFromProto(int size, const Types& types, if (types.Get(i).has_struct_()) { const auto& struct_ = types.Get(i).struct_(); - ARROW_ASSIGN_OR_RAISE(type, FieldsFromProto(struct_.types_size(), struct_.types(), - next_name, ext_set, conversion_options) - .Map(arrow::struct_)); + ARROW_ASSIGN_OR_RAISE( + auto fields, FieldsFromProto(struct_.types_size(), struct_.types(), next_name, + ext_set, conversion_options)); + type = ::arrow::struct_(std::move(fields)); nullable = IsNullable(struct_); } else { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 9267f1e499720..86df91268f8b9 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -276,6 +276,17 @@ std::shared_ptr MaybePromoteNullTypes(const Field& existing, const Field& // `other` must be null. return existing.WithNullable(true); } + +std::vector> MakeFields( + std::initializer_list>> init_list) { + std::vector> fields; + fields.reserve(init_list.size()); + for (const auto& [name, type] : init_list) { + fields.push_back(field(name, type)); + } + return fields; +} + } // namespace Field::~Field() {} @@ -2125,12 +2136,24 @@ std::shared_ptr schema(std::vector> fields, return std::make_shared(std::move(fields), std::move(metadata)); } +std::shared_ptr schema( + std::initializer_list>> fields, + std::shared_ptr metadata) { + return std::make_shared(MakeFields(fields), std::move(metadata)); +} + std::shared_ptr schema(std::vector> fields, Endianness endianness, std::shared_ptr metadata) { return std::make_shared(std::move(fields), endianness, std::move(metadata)); } +std::shared_ptr schema( + std::initializer_list>> fields, + Endianness endianness, std::shared_ptr metadata) { + return std::make_shared(MakeFields(fields), endianness, std::move(metadata)); +} + Result> UnifySchemas( const std::vector>& schemas, const Field::MergeOptions field_merge_options) { @@ -2641,6 +2664,11 @@ std::shared_ptr struct_(const std::vector>& fie return std::make_shared(fields); } +std::shared_ptr struct_( + std::initializer_list>> fields) { + return std::make_shared(MakeFields(fields)); +} + std::shared_ptr run_end_encoded(std::shared_ptr run_end_type, std::shared_ptr value_type) { return std::make_shared(std::move(run_end_type), diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index a8a27139d11bc..d3b41c8158cf7 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -560,6 +560,10 @@ ARROW_EXPORT std::shared_ptr time64(TimeUnit::type unit); ARROW_EXPORT std::shared_ptr struct_( const std::vector>& fields); +/// \brief Create a StructType instance from (name, type) pairs +ARROW_EXPORT std::shared_ptr struct_( + std::initializer_list>> fields); + /// \brief Create a RunEndEncodedType instance ARROW_EXPORT std::shared_ptr run_end_encoded( std::shared_ptr run_end_type, std::shared_ptr value_type); @@ -629,6 +633,18 @@ std::shared_ptr schema( std::vector> fields, std::shared_ptr 
metadata = NULLPTR); +/// \brief Create a Schema instance from (name, type) pairs +/// +/// The schema's fields will all be nullable with no associated metadata. +/// +/// \param fields (name, type) pairs of the schema's fields +/// \param metadata any custom key-value metadata, default null +/// \return schema shared_ptr to Schema +ARROW_EXPORT +std::shared_ptr schema( + std::initializer_list>> fields, + std::shared_ptr metadata = NULLPTR); + /// \brief Create a Schema instance /// /// \param fields the schema's fields @@ -640,6 +656,19 @@ std::shared_ptr schema( std::vector> fields, Endianness endianness, std::shared_ptr metadata = NULLPTR); +/// \brief Create a Schema instance +/// +/// The schema's fields will all be nullable with no associated metadata. +/// +/// \param fields (name, type) pairs of the schema's fields +/// \param endianness the endianness of the data +/// \param metadata any custom key-value metadata, default null +/// \return schema shared_ptr to Schema +ARROW_EXPORT +std::shared_ptr schema( + std::initializer_list>> fields, + Endianness endianness, std::shared_ptr metadata = NULLPTR); + /// @} /// Return the process-wide default memory pool. diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index b008929e87a19..c55b33b4151e4 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -414,6 +414,13 @@ TEST_F(TestSchema, Basics) { ASSERT_NE(schema4->fingerprint(), schema7->fingerprint()); ASSERT_EQ(schema6->fingerprint(), schema7->fingerprint()); #endif + + auto schema8 = ::arrow::schema({field("f0", int8()), field("f1", int32())}); + auto schema9 = ::arrow::schema({{"f0", int8()}, {"f1", int32()}}); + auto schema10 = ::arrow::schema({{"f2", int8()}, {"f1", int32()}}); + + AssertSchemaEqual(schema8, schema9); + AssertSchemaNotEqual(schema8, schema10); } TEST_F(TestSchema, ToString) { @@ -1479,6 +1486,12 @@ TEST(TestStructType, Basics) { ASSERT_EQ(struct_type.ToString(), "struct"); + auto t1 = struct_({{"a", int8()}, {"b", utf8()}}); + auto t2 = struct_({field("a", int8()), field("b", utf8())}); + auto t3 = struct_({field("c", int8()), field("b", utf8())}); + ASSERT_TRUE(t1->Equals(t2)); + ASSERT_TRUE(!t1->Equals(t3)); + // TODO(wesm): out of bounds for field(...) } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index f713548d05a70..3323b7ff8b608 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -839,7 +839,7 @@ std::function(FieldVector)> GetNestedFactory( switch (inferred_type.id()) { case ::arrow::Type::STRUCT: if (origin_type.id() == ::arrow::Type::STRUCT) { - return ::arrow::struct_; + return [](FieldVector fields) { return ::arrow::struct_(std::move(fields)); }; } break; case ::arrow::Type::LIST: From 1a00fecf7dc1d75bfb5d31b6c5fae4b3d646bf1b Mon Sep 17 00:00:00 2001 From: pegasas <616672335@qq.com> Date: Wed, 9 Aug 2023 22:44:34 +0800 Subject: [PATCH 125/749] GH-36674: [C++] Use anonymous namespace in arrow/ipc/reader.cc (#36937) ### Rationale for this change The following types and functions should be in file scope: IpcReadContext, BatchDataReadRequest, ArrayLoader, DecompressBuffer(s), LoadRecordBatch*, GetCompression*, ReadRecordBatchInternal, GetInclusionMaskAndOutSchema, UnpackSchemaMessage, ReadDictionary, AsyncRecordBatchStreamReaderImpl. ### What changes are included in this PR? Use anonymous namespace around the aforementioned definitions. ### Are these changes tested? No ### Are there any user-facing changes?
No Lead-authored-by: pegasas <616672335@qq.com> Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/ipc/reader.cc | 81 ++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index eadba69f05612..125a8f2d4158c 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -109,8 +109,6 @@ Status InvalidMessageType(MessageType expected, MessageType actual) { } \ } while (0) -} // namespace - // ---------------------------------------------------------------------- // Record batch read path @@ -634,8 +632,7 @@ Status GetCompressionExperimental(const flatbuf::Message* message, return Status::OK(); } -static Status ReadContiguousPayload(io::InputStream* file, - std::unique_ptr* message) { +Status ReadContiguousPayload(io::InputStream* file, std::unique_ptr* message) { ARROW_ASSIGN_OR_RAISE(*message, ReadMessage(file)); if (*message == nullptr) { return Status::Invalid("Unable to read metadata at offset"); @@ -643,27 +640,6 @@ static Status ReadContiguousPayload(io::InputStream* file, return Status::OK(); } -Result> ReadRecordBatch( - const std::shared_ptr& schema, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, io::InputStream* file) { - std::unique_ptr message; - RETURN_NOT_OK(ReadContiguousPayload(file, &message)); - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - return ReadRecordBatch(*message->metadata(), schema, dictionary_memo, options, - reader.get()); -} - -Result> ReadRecordBatch( - const Message& message, const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, const IpcReadOptions& options) { - CHECK_MESSAGE_TYPE(MessageType::RECORD_BATCH, message.type()); - CHECK_HAS_BODY(message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadRecordBatch(*message.metadata(), schema, dictionary_memo, options, - reader.get()); -} - Result ReadRecordBatchInternal( const Buffer& metadata, const std::shared_ptr& schema, const std::vector& inclusion_mask, IpcReadContext& context, @@ -764,22 +740,6 @@ Status UnpackSchemaMessage(const Message& message, const IpcReadOptions& options out_schema, field_inclusion_mask, swap_endian); } -Result> ReadRecordBatch( - const Buffer& metadata, const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, - io::RandomAccessFile* file) { - std::shared_ptr out_schema; - // Empty means do not use - std::vector inclusion_mask; - IpcReadContext context(const_cast(dictionary_memo), options, false); - RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields, - &inclusion_mask, &out_schema)); - ARROW_ASSIGN_OR_RAISE( - auto batch_and_custom_metadata, - ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file)); - return batch_and_custom_metadata.batch; -} - Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, DictionaryKind* kind, io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; @@ -851,6 +811,45 @@ Status ReadDictionary(const Message& message, const IpcReadContext& context, return ReadDictionary(*message.metadata(), context, kind, reader.get()); } +} // namespace + +Result> ReadRecordBatch( + const Buffer& metadata, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, + io::RandomAccessFile* file) { + std::shared_ptr 
out_schema; + // Empty means do not use + std::vector inclusion_mask; + IpcReadContext context(const_cast(dictionary_memo), options, false); + RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields, + &inclusion_mask, &out_schema)); + ARROW_ASSIGN_OR_RAISE( + auto batch_and_custom_metadata, + ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file)); + return batch_and_custom_metadata.batch; +} + +Result> ReadRecordBatch( + const std::shared_ptr& schema, const DictionaryMemo* dictionary_memo, + const IpcReadOptions& options, io::InputStream* file) { + std::unique_ptr message; + RETURN_NOT_OK(ReadContiguousPayload(file, &message)); + CHECK_HAS_BODY(*message); + ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); + return ReadRecordBatch(*message->metadata(), schema, dictionary_memo, options, + reader.get()); +} + +Result> ReadRecordBatch( + const Message& message, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, const IpcReadOptions& options) { + CHECK_MESSAGE_TYPE(MessageType::RECORD_BATCH, message.type()); + CHECK_HAS_BODY(message); + ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); + return ReadRecordBatch(*message.metadata(), schema, dictionary_memo, options, + reader.get()); +} + // Streaming format decoder class StreamDecoderInternal : public MessageDecoderListener { public: From 9f183fc3e38abfd2edc492767280e6e917997e2c Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 9 Aug 2023 11:01:44 -0400 Subject: [PATCH 126/749] GH-36512: [C++][FlightRPC] Add async GetFlightInfo client call (#36517) ### Rationale for this change Async is a long-requested feature. ### What changes are included in this PR? Just the C++ implementation of async GetFlightInfo for the client. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes, new APIs. 
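For illustration, a minimal sketch of a caller using the new Future-returning overload (the wrapper function and command string are made up for the example; `MoveResult()` blocks here only to keep the sketch short):

```
#include "arrow/flight/client.h"
#include "arrow/result.h"
#include "arrow/util/future.h"

// Issue GetFlightInfo asynchronously and wait on the returned Future.
arrow::Status GetInfoExample(arrow::flight::FlightClient* client) {
  arrow::Future<arrow::flight::FlightInfo> info_fut =
      client->GetFlightInfoAsync(arrow::flight::FlightDescriptor::Command("example"));
  ARROW_ASSIGN_OR_RAISE(auto info, info_fut.MoveResult());
  // info can now be consumed exactly like the result of the blocking
  // GetFlightInfo() call.
  return arrow::Status::OK();
}
```

A real caller would typically attach a callback to the Future (or pass an `AsyncListener<FlightInfo>`) instead of waiting, and can check `FlightClient::supports_async()` before using the async path.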
* Closes: #36512 Authored-by: David Li Signed-off-by: David Li --- cpp/src/arrow/flight/CMakeLists.txt | 5 + cpp/src/arrow/flight/api.h | 1 + cpp/src/arrow/flight/client.cc | 58 ++++ cpp/src/arrow/flight/client.h | 28 ++ cpp/src/arrow/flight/flight_internals_test.cc | 6 +- cpp/src/arrow/flight/flight_test.cc | 25 +- .../arrow/flight/serialization_internal.cc | 24 +- cpp/src/arrow/flight/serialization_internal.h | 2 +- cpp/src/arrow/flight/test_definitions.cc | 267 +++++++++++++++++- cpp/src/arrow/flight/test_definitions.h | 26 ++ cpp/src/arrow/flight/transport.cc | 16 ++ cpp/src/arrow/flight/transport.h | 50 +++- .../flight/transport/grpc/grpc_client.cc | 194 ++++++++++++- .../flight/transport/grpc/util_internal.cc | 107 ++++++- .../flight/transport/grpc/util_internal.h | 8 + cpp/src/arrow/flight/type_fwd.h | 5 + cpp/src/arrow/flight/types.cc | 92 +++++- cpp/src/arrow/flight/types.h | 82 +++++- cpp/src/arrow/flight/types_async.h | 80 ++++++ 19 files changed, 1024 insertions(+), 52 deletions(-) create mode 100644 cpp/src/arrow/flight/types_async.h diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index 7383a7eec9045..6e76181533459 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -119,6 +119,11 @@ else() add_definitions(-DGRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS=grpc_impl::experimental) endif() +# Was in a different namespace, or simply not supported, prior to this +if(ARROW_GRPC_VERSION VERSION_GREATER_EQUAL "1.40") + add_definitions(-DGRPC_ENABLE_ASYNC) +endif() + # Restore the CXXFLAGS that were modified above set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BACKUP}") diff --git a/cpp/src/arrow/flight/api.h b/cpp/src/arrow/flight/api.h index 61c475dc20473..ed31b5c8fa41f 100644 --- a/cpp/src/arrow/flight/api.h +++ b/cpp/src/arrow/flight/api.h @@ -27,3 +27,4 @@ #include "arrow/flight/server_middleware.h" #include "arrow/flight/server_tracing_middleware.h" #include "arrow/flight/types.h" +#include "arrow/flight/types_async.h" diff --git a/cpp/src/arrow/flight/client.cc b/cpp/src/arrow/flight/client.cc index ec5377b7c11dc..eb62ec65ffc5f 100644 --- a/cpp/src/arrow/flight/client.cc +++ b/cpp/src/arrow/flight/client.cc @@ -32,6 +32,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/table.h" +#include "arrow/util/future.h" #include "arrow/util/logging.h" #include "arrow/flight/client_auth.h" @@ -39,11 +40,48 @@ #include "arrow/flight/transport.h" #include "arrow/flight/transport/grpc/grpc_client.h" #include "arrow/flight/types.h" +#include "arrow/flight/types_async.h" namespace arrow { namespace flight { +namespace { +template +class UnaryUnaryAsyncListener : public AsyncListener { + public: + UnaryUnaryAsyncListener() : future_(arrow::Future::Make()) {} + + void OnNext(T result) override { + DCHECK(!result_.ok()); + result_ = std::move(result); + } + + void OnFinish(Status status) override { + if (status.ok()) { + DCHECK(result_.ok()); + } else { + // Default-initialized result is not ok + DCHECK(!result_.ok()); + result_ = std::move(status); + } + future_.MarkFinished(std::move(result_)); + } + + static std::pair>, arrow::Future> Make() { + auto self = std::make_shared>(); + // Keep the listener alive by stashing it in the future + self->future_.AddCallback([self](const arrow::Result&) {}); + auto future = self->future_; + return std::make_pair(std::move(self), std::move(future)); + } + + private: + arrow::Result result_; + arrow::Future future_; +}; +} // namespace + const char* 
kWriteSizeDetailTypeId = "flight::FlightWriteSizeStatusDetail"; FlightCallOptions::FlightCallOptions() @@ -584,6 +622,24 @@ arrow::Result> FlightClient::GetFlightInfo( return info; } +void FlightClient::GetFlightInfoAsync( + const FlightCallOptions& options, const FlightDescriptor& descriptor, + std::shared_ptr> listener) { + if (auto status = CheckOpen(); !status.ok()) { + listener->OnFinish(std::move(status)); + return; + } + transport_->GetFlightInfoAsync(options, descriptor, std::move(listener)); +} + +arrow::Future FlightClient::GetFlightInfoAsync( + const FlightCallOptions& options, const FlightDescriptor& descriptor) { + RETURN_NOT_OK(CheckOpen()); + auto [listener, future] = UnaryUnaryAsyncListener::Make(); + transport_->GetFlightInfoAsync(options, descriptor, std::move(listener)); + return future; +} + arrow::Result> FlightClient::GetSchema( const FlightCallOptions& options, const FlightDescriptor& descriptor) { RETURN_NOT_OK(CheckOpen()); @@ -658,6 +714,8 @@ Status FlightClient::Close() { return Status::OK(); } +bool FlightClient::supports_async() const { return transport_->supports_async(); } + Status FlightClient::CheckOpen() const { if (closed_) { return Status::Invalid("FlightClient is closed"); diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index 7204b469a6127..cc1c35aaebed3 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -271,6 +271,31 @@ class ARROW_FLIGHT_EXPORT FlightClient { return GetFlightInfo({}, descriptor); } + /// \brief Asynchronous GetFlightInfo. + /// \param[in] options Per-RPC options + /// \param[in] descriptor the dataset request + /// \param[in] listener Callbacks for response and RPC completion + /// + /// This API is EXPERIMENTAL. + void GetFlightInfoAsync(const FlightCallOptions& options, + const FlightDescriptor& descriptor, + std::shared_ptr> listener); + void GetFlightInfoAsync(const FlightDescriptor& descriptor, + std::shared_ptr> listener) { + return GetFlightInfoAsync({}, descriptor, std::move(listener)); + } + + /// \brief Asynchronous GetFlightInfo returning a Future. + /// \param[in] options Per-RPC options + /// \param[in] descriptor the dataset request + /// + /// This API is EXPERIMENTAL. + arrow::Future GetFlightInfoAsync(const FlightCallOptions& options, + const FlightDescriptor& descriptor); + arrow::Future GetFlightInfoAsync(const FlightDescriptor& descriptor) { + return GetFlightInfoAsync({}, descriptor); + } + /// \brief Request schema for a single flight, which may be an existing /// dataset or a command to be executed /// \param[in] options Per-RPC options @@ -355,6 +380,9 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// \since 8.0.0 Status Close(); + /// \brief Whether this client supports asynchronous methods. 
+ bool supports_async() const; + private: FlightClient(); Status CheckOpen() const; diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index e56bab6db2092..72a25018e8ee8 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -76,9 +76,7 @@ void TestRoundtrip(const std::vector& values, ASSERT_OK(internal::ToProto(values[i], &pb_value)); if constexpr (std::is_same_v) { - FlightInfo::Data data; - ASSERT_OK(internal::FromProto(pb_value, &data)); - FlightInfo value(std::move(data)); + ASSERT_OK_AND_ASSIGN(FlightInfo value, internal::FromProto(pb_value)); EXPECT_EQ(values[i], value); } else if constexpr (std::is_same_v) { std::string data; @@ -742,5 +740,7 @@ TEST(TransportErrorHandling, ReconstructStatus) { ASSERT_EQ(detail->extra_info(), "Binary error details"); } +// TODO: test TransportStatusDetail + } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/flight_test.cc b/cpp/src/arrow/flight/flight_test.cc index 1e7ea9bb002bb..c36c9eee71ba1 100644 --- a/cpp/src/arrow/flight/flight_test.cc +++ b/cpp/src/arrow/flight/flight_test.cc @@ -40,6 +40,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/base64.h" +#include "arrow/util/future.h" #include "arrow/util/logging.h" #ifdef GRPCPP_GRPCPP_H @@ -91,9 +92,16 @@ const char kAuthHeader[] = "authorization"; //------------------------------------------------------------ // Common transport tests +#ifdef GRPC_ENABLE_ASYNC +constexpr bool kGrpcSupportsAsync = true; +#else +constexpr bool kGrpcSupportsAsync = false; +#endif + class GrpcConnectivityTest : public ConnectivityTest, public ::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; @@ -102,6 +110,7 @@ ARROW_FLIGHT_TEST_CONNECTIVITY(GrpcConnectivityTest); class GrpcDataTest : public DataTest, public ::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; @@ -110,6 +119,7 @@ ARROW_FLIGHT_TEST_DATA(GrpcDataTest); class GrpcDoPutTest : public DoPutTest, public ::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; @@ -118,6 +128,7 @@ ARROW_FLIGHT_TEST_DO_PUT(GrpcDoPutTest); class GrpcAppMetadataTest : public AppMetadataTest, public ::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; @@ -126,6 +137,7 @@ ARROW_FLIGHT_TEST_APP_METADATA(GrpcAppMetadataTest); class GrpcIpcOptionsTest : public IpcOptionsTest, public ::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; @@ -134,6 +146,7 @@ ARROW_FLIGHT_TEST_IPC_OPTIONS(GrpcIpcOptionsTest); class GrpcCudaDataTest : public CudaDataTest, public 
::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; @@ -142,11 +155,21 @@ ARROW_FLIGHT_TEST_CUDA_DATA(GrpcCudaDataTest); class GrpcErrorHandlingTest : public ErrorHandlingTest, public ::testing::Test { protected: std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } void SetUp() override { SetUpTest(); } void TearDown() override { TearDownTest(); } }; ARROW_FLIGHT_TEST_ERROR_HANDLING(GrpcErrorHandlingTest); +class GrpcAsyncClientTest : public AsyncClientTest, public ::testing::Test { + protected: + std::string transport() const override { return "grpc"; } + bool supports_async() const override { return kGrpcSupportsAsync; } + void SetUp() override { SetUpTest(); } + void TearDown() override { TearDownTest(); } +}; +ARROW_FLIGHT_TEST_ASYNC_CLIENT(GrpcAsyncClientTest); + //------------------------------------------------------------ // Ad-hoc gRPC-specific tests @@ -443,7 +466,7 @@ class TestTls : public ::testing::Test { Location location_; std::unique_ptr client_; std::unique_ptr server_; - bool server_is_initialized_; + bool server_is_initialized_ = false; }; // A server middleware that rejects all calls. diff --git a/cpp/src/arrow/flight/serialization_internal.cc b/cpp/src/arrow/flight/serialization_internal.cc index b0859e1d9164e..5d09a1a04586c 100644 --- a/cpp/src/arrow/flight/serialization_internal.cc +++ b/cpp/src/arrow/flight/serialization_internal.cc @@ -230,20 +230,21 @@ Status ToProto(const FlightDescriptor& descriptor, pb::FlightDescriptor* pb_desc // FlightInfo -Status FromProto(const pb::FlightInfo& pb_info, FlightInfo::Data* info) { - RETURN_NOT_OK(FromProto(pb_info.flight_descriptor(), &info->descriptor)); +arrow::Result FromProto(const pb::FlightInfo& pb_info) { + FlightInfo::Data info; + RETURN_NOT_OK(FromProto(pb_info.flight_descriptor(), &info.descriptor)); - info->schema = pb_info.schema(); + info.schema = pb_info.schema(); - info->endpoints.resize(pb_info.endpoint_size()); + info.endpoints.resize(pb_info.endpoint_size()); for (int i = 0; i < pb_info.endpoint_size(); ++i) { - RETURN_NOT_OK(FromProto(pb_info.endpoint(i), &info->endpoints[i])); + RETURN_NOT_OK(FromProto(pb_info.endpoint(i), &info.endpoints[i])); } - info->total_records = pb_info.total_records(); - info->total_bytes = pb_info.total_bytes(); - info->ordered = pb_info.ordered(); - return Status::OK(); + info.total_records = pb_info.total_records(); + info.total_bytes = pb_info.total_bytes(); + info.ordered = pb_info.ordered(); + return FlightInfo(std::move(info)); } Status FromProto(const pb::BasicAuth& pb_basic_auth, BasicAuth* basic_auth) { @@ -291,9 +292,8 @@ Status ToProto(const FlightInfo& info, pb::FlightInfo* pb_info) { Status FromProto(const pb::CancelFlightInfoRequest& pb_request, CancelFlightInfoRequest* request) { - FlightInfo::Data data; - RETURN_NOT_OK(FromProto(pb_request.info(), &data)); - request->info = std::make_unique(std::move(data)); + ARROW_ASSIGN_OR_RAISE(FlightInfo info, FromProto(pb_request.info())); + request->info = std::make_unique(std::move(info)); return Status::OK(); } diff --git a/cpp/src/arrow/flight/serialization_internal.h b/cpp/src/arrow/flight/serialization_internal.h index b0a3491ac261d..30eb0b31819b8 100644 --- a/cpp/src/arrow/flight/serialization_internal.h +++ b/cpp/src/arrow/flight/serialization_internal.h 
@@ -59,7 +59,7 @@ Status FromProto(const pb::FlightDescriptor& pb_descr, FlightDescriptor* descr); Status FromProto(const pb::FlightEndpoint& pb_endpoint, FlightEndpoint* endpoint); Status FromProto(const pb::RenewFlightEndpointRequest& pb_request, RenewFlightEndpointRequest* request); -Status FromProto(const pb::FlightInfo& pb_info, FlightInfo::Data* info); +arrow::Result FromProto(const pb::FlightInfo& pb_info); Status FromProto(const pb::CancelFlightInfoRequest& pb_request, CancelFlightInfoRequest* request); Status FromProto(const pb::SchemaResult& pb_result, std::string* result); diff --git a/cpp/src/arrow/flight/test_definitions.cc b/cpp/src/arrow/flight/test_definitions.cc index 4e137380044f3..55be3244fbde4 100644 --- a/cpp/src/arrow/flight/test_definitions.cc +++ b/cpp/src/arrow/flight/test_definitions.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "arrow/array/array_base.h" #include "arrow/array/array_dict.h" @@ -27,7 +28,11 @@ #include "arrow/flight/api.h" #include "arrow/flight/client_middleware.h" #include "arrow/flight/test_util.h" +#include "arrow/flight/types.h" +#include "arrow/flight/types_async.h" +#include "arrow/status.h" #include "arrow/table.h" +#include "arrow/testing/future_util.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/checked_cast.h" @@ -123,6 +128,28 @@ void ConnectivityTest::TestBrokenConnection() { //------------------------------------------------------------ // Tests of data plane methods +namespace { +class GetFlightInfoListener : public AsyncListener { + public: + void OnNext(FlightInfo message) override { + info = std::move(message); + counter++; + } + void OnFinish(Status status) override { + ASSERT_FALSE(future.is_finished()); + if (status.ok()) { + future.MarkFinished(std::move(info)); + } else { + future.MarkFinished(std::move(status)); + } + } + + FlightInfo info = FlightInfo(FlightInfo::Data{}); + int counter = 0; + arrow::Future future = arrow::Future::Make(); +}; +} // namespace + void DataTest::SetUpTest() { server_ = ExampleTestServer(); @@ -150,6 +177,14 @@ void DataTest::CheckDoGet( ASSERT_OK_AND_ASSIGN(auto info, client_->GetFlightInfo(descr)); check_endpoints(info->endpoints()); + if (supports_async()) { + auto listener = std::make_shared(); + client_->GetFlightInfoAsync(descr, listener); + ASSERT_FINISHES_OK(listener->future); + ASSERT_EQ(1, listener->counter); + check_endpoints(listener->future.MoveResult()->endpoints()); + } + ipc::DictionaryMemo dict_memo; ASSERT_OK_AND_ASSIGN(auto schema, info->GetSchema(&dict_memo)); AssertSchemaEqual(*expected_schema, *schema); @@ -671,11 +706,11 @@ void DoPutTest::SetUpTest() { void DoPutTest::TearDownTest() { ASSERT_OK(client_->Close()); ASSERT_OK(server_->Shutdown()); - reinterpret_cast(server_.get())->batches_.clear(); + checked_cast(server_.get())->batches_.clear(); } void DoPutTest::CheckBatches(const FlightDescriptor& expected_descriptor, const RecordBatchVector& expected_batches) { - auto* do_put_server = (DoPutTestServer*)server_.get(); + auto* do_put_server = static_cast(server_.get()); ASSERT_EQ(do_put_server->descriptor_, expected_descriptor); ASSERT_EQ(do_put_server->batches_.size(), expected_batches.size()); for (size_t i = 0; i < expected_batches.size(); ++i) { @@ -1410,6 +1445,26 @@ static const std::vector kStatusCodes = { StatusCode::AlreadyExists, }; +// For each Arrow status code, what Flight code do we get? 
+static const std::unordered_map kTransportStatusCodes = { + {StatusCode::OutOfMemory, TransportStatusCode::kUnknown}, + {StatusCode::KeyError, TransportStatusCode::kNotFound}, + {StatusCode::TypeError, TransportStatusCode::kUnknown}, + {StatusCode::Invalid, TransportStatusCode::kInvalidArgument}, + {StatusCode::IOError, TransportStatusCode::kUnknown}, + {StatusCode::CapacityError, TransportStatusCode::kUnknown}, + {StatusCode::IndexError, TransportStatusCode::kUnknown}, + {StatusCode::Cancelled, TransportStatusCode::kCancelled}, + {StatusCode::UnknownError, TransportStatusCode::kUnknown}, + {StatusCode::NotImplemented, TransportStatusCode::kUnimplemented}, + {StatusCode::SerializationError, TransportStatusCode::kUnknown}, + {StatusCode::RError, TransportStatusCode::kUnknown}, + {StatusCode::CodeGenError, TransportStatusCode::kUnknown}, + {StatusCode::ExpressionValidationError, TransportStatusCode::kUnknown}, + {StatusCode::ExecutionError, TransportStatusCode::kUnknown}, + {StatusCode::AlreadyExists, TransportStatusCode::kAlreadyExists}, +}; + static const std::vector kFlightStatusCodes = { FlightStatusCode::Internal, FlightStatusCode::TimedOut, FlightStatusCode::Cancelled, FlightStatusCode::Unauthenticated, @@ -1517,6 +1572,15 @@ class MetadataRecordingClientMiddlewareFactory : public ClientMiddlewareFactory mutable std::mutex mutex_; std::vector> headers_; }; + +class TransportStatusListener : public AsyncListener { + public: + void OnNext(FlightInfo /*message*/) override {} + void OnFinish(Status status) override { future.MarkFinished(std::move(status)); } + + arrow::Future<> future = arrow::Future<>::Make(); +}; + } // namespace struct ErrorHandlingTest::Impl { @@ -1544,6 +1608,98 @@ std::vector> ErrorHandlingTest::GetHeaders() return impl_->metadata->GetHeaders(); } +void ErrorHandlingTest::TestAsyncGetFlightInfo() { + if (!supports_async()) { + GTEST_SKIP() << "Transport does not support async"; + } + // Server-side still does all the junk around trying to translate Arrow + // status codes, so this test is a little indirect + + for (const auto code : kStatusCodes) { + ARROW_SCOPED_TRACE("C++ status code: ", static_cast(code), ": ", + Status::CodeAsString(code)); + + // Just the status code + { + auto descr = FlightDescriptor::Path( + {std::to_string(static_cast(code)), "Expected message"}); + auto listener = std::make_shared(); + + client_->GetFlightInfoAsync(descr, listener); + EXPECT_FINISHES(listener->future); + auto detail = TransportStatusDetail::Unwrap(listener->future.status()); + ASSERT_TRUE(detail.has_value()); + + EXPECT_EQ(detail->get().code(), kTransportStatusCodes.at(code)); + // Exact equality - should have no extra junk in the message + EXPECT_EQ(detail->get().message(), "Expected message"); + } + + // Custom status detail + { + auto descr = FlightDescriptor::Path( + {std::to_string(static_cast(code)), "Expected message", ""}); + auto listener = std::make_shared(); + + client_->GetFlightInfoAsync(descr, listener); + EXPECT_FINISHES(listener->future); + auto detail = TransportStatusDetail::Unwrap(listener->future.status()); + ASSERT_TRUE(detail.has_value()); + + EXPECT_EQ(detail->get().code(), kTransportStatusCodes.at(code)); + // The server-side arrow::Status-to-TransportStatus conversion puts the + // detail into the main error message. + EXPECT_EQ(detail->get().message(), + "Expected message. 
Detail: Custom status detail"); + + std::string_view arrow_code, arrow_message; + for (const auto& [key, value] : detail->get().details()) { + if (key == "x-arrow-status") { + arrow_code = value; + } else if (key == "x-arrow-status-message-bin") { + arrow_message = value; + } + } + EXPECT_EQ(arrow_code, std::to_string(static_cast(code))); + EXPECT_EQ(arrow_message, "Expected message"); + } + + // Flight status detail + for (const auto flight_code : kFlightStatusCodes) { + ARROW_SCOPED_TRACE("Flight status code: ", static_cast(flight_code)); + auto descr = FlightDescriptor::Path( + {std::to_string(static_cast(code)), "Expected message", + std::to_string(static_cast(flight_code)), "Expected detail message"}); + auto listener = std::make_shared(); + + client_->GetFlightInfoAsync(descr, listener); + EXPECT_FINISHES(listener->future); + auto detail = TransportStatusDetail::Unwrap(listener->future.status()); + ASSERT_TRUE(detail.has_value()); + + // The server-side arrow::Status-to-TransportStatus conversion puts the + // detail into the main error message. + EXPECT_THAT(detail->get().message(), + ::testing::HasSubstr("Expected message. Detail:")); + + std::string_view arrow_code, arrow_message, binary_detail; + for (const auto& [key, value] : detail->get().details()) { + if (key == "x-arrow-status") { + arrow_code = value; + } else if (key == "x-arrow-status-message-bin") { + arrow_message = value; + } else if (key == "grpc-status-details-bin") { + binary_detail = value; + } + } + + EXPECT_EQ(arrow_code, std::to_string(static_cast(code))); + EXPECT_EQ(arrow_message, "Expected message"); + EXPECT_EQ(binary_detail, "Expected detail message"); + } + } +} + void ErrorHandlingTest::TestGetFlightInfo() { std::unique_ptr info; for (const auto code : kStatusCodes) { @@ -1656,5 +1812,112 @@ void ErrorHandlingTest::TestDoExchange() { reader_thread.join(); } +//------------------------------------------------------------ +// Test async clients + +void AsyncClientTest::SetUpTest() { + if (!supports_async()) { + GTEST_SKIP() << "async is not supported"; + } + + ASSERT_OK_AND_ASSIGN(auto location, Location::ForScheme(transport(), "127.0.0.1", 0)); + + server_ = ExampleTestServer(); + FlightServerOptions server_options(location); + ASSERT_OK(server_->Init(server_options)); + + std::string uri = location.scheme() + "://127.0.0.1:" + std::to_string(server_->port()); + ASSERT_OK_AND_ASSIGN(auto real_location, Location::Parse(uri)); + FlightClientOptions client_options = FlightClientOptions::Defaults(); + ASSERT_OK_AND_ASSIGN(client_, FlightClient::Connect(real_location, client_options)); + + ASSERT_TRUE(client_->supports_async()); +} +void AsyncClientTest::TearDownTest() { + if (supports_async()) { + ASSERT_OK(client_->Close()); + ASSERT_OK(server_->Shutdown()); + } +} + +void AsyncClientTest::TestGetFlightInfo() { + class Listener : public AsyncListener { + public: + void OnNext(FlightInfo info) override { + info_ = std::move(info); + counter_++; + } + + void OnFinish(Status status) override { + ASSERT_FALSE(future_.is_finished()); + if (status.ok()) { + future_.MarkFinished(std::move(info_)); + } else { + future_.MarkFinished(std::move(status)); + } + } + + int counter_ = 0; + FlightInfo info_ = FlightInfo(FlightInfo::Data()); + arrow::Future future_ = arrow::Future::Make(); + }; + + auto descr = FlightDescriptor::Command("status-outofmemory"); + auto listener = std::make_shared(); + client_->GetFlightInfoAsync(descr, listener); + + ASSERT_FINISHES_AND_RAISES(UnknownError, listener->future_); + 
ASSERT_THAT(listener->future_.status().ToString(), ::testing::HasSubstr("Sentinel")); + ASSERT_EQ(0, listener->counter_); +} + +void AsyncClientTest::TestGetFlightInfoFuture() { + auto descr = FlightDescriptor::Command("status-outofmemory"); + auto future = client_->GetFlightInfoAsync(descr); + ASSERT_FINISHES_AND_RAISES(UnknownError, future); + ASSERT_THAT(future.status().ToString(), ::testing::HasSubstr("Sentinel")); + + descr = FlightDescriptor::Command("my_command"); + future = client_->GetFlightInfoAsync(descr); + ASSERT_FINISHES_OK_AND_ASSIGN(auto info, future); + // See test_util.cc:ExampleFlightInfo + ASSERT_EQ(descr, info.descriptor()); + ASSERT_EQ(1000, info.total_records()); + ASSERT_EQ(100000, info.total_bytes()); +} + +void AsyncClientTest::TestListenerLifetime() { + arrow::Future future = arrow::Future::Make(); + + class Listener : public AsyncListener { + public: + void OnNext(FlightInfo info) override { info_ = std::move(info); } + + void OnFinish(Status status) override { + if (status.ok()) { + future_.MarkFinished(std::move(info_)); + } else { + future_.MarkFinished(std::move(status)); + } + } + + FlightInfo info_ = FlightInfo(FlightInfo::Data()); + arrow::Future future_; + }; + + // Bad client code: don't retain a reference to the listener, which owns the + // RPC state. We should still be able to get the result without crashing. (The + // RPC state is disposed of in the background via the 'garbage bin' in the + // gRPC client implementation.) + { + auto descr = FlightDescriptor::Command("my_command"); + auto listener = std::make_shared(); + listener->future_ = future; + client_->GetFlightInfoAsync(descr, std::move(listener)); + } + + ASSERT_FINISHES_OK(future); +} + } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/test_definitions.h b/cpp/src/arrow/flight/test_definitions.h index c73bc264b4966..1e0e8c209ac94 100644 --- a/cpp/src/arrow/flight/test_definitions.h +++ b/cpp/src/arrow/flight/test_definitions.h @@ -40,6 +40,7 @@ namespace flight { class ARROW_FLIGHT_EXPORT FlightTest { protected: virtual std::string transport() const = 0; + virtual bool supports_async() const { return false; } virtual void SetUpTest() {} virtual void TearDownTest() {} }; @@ -266,6 +267,7 @@ class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest { // Test methods void TestGetFlightInfo(); void TestGetFlightInfoMetadata(); + void TestAsyncGetFlightInfo(); void TestDoPut(); void TestDoExchange(); @@ -282,10 +284,34 @@ class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest { #define ARROW_FLIGHT_TEST_ERROR_HANDLING(FIXTURE) \ static_assert(std::is_base_of::value, \ ARROW_STRINGIFY(FIXTURE) " must inherit from ErrorHandlingTest"); \ + TEST_F(FIXTURE, TestAsyncGetFlightInfo) { TestAsyncGetFlightInfo(); } \ TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); } \ TEST_F(FIXTURE, TestGetFlightInfoMetadata) { TestGetFlightInfoMetadata(); } \ TEST_F(FIXTURE, TestDoPut) { TestDoPut(); } \ TEST_F(FIXTURE, TestDoExchange) { TestDoExchange(); } +/// \brief Tests of the async client. 
+class ARROW_FLIGHT_EXPORT AsyncClientTest : public FlightTest { + public: + void SetUpTest() override; + void TearDownTest() override; + + // Test methods + void TestGetFlightInfo(); + void TestGetFlightInfoFuture(); + void TestListenerLifetime(); + + private: + std::unique_ptr client_; + std::unique_ptr server_; +}; + +#define ARROW_FLIGHT_TEST_ASYNC_CLIENT(FIXTURE) \ + static_assert(std::is_base_of::value, \ + ARROW_STRINGIFY(FIXTURE) " must inherit from AsyncClientTest"); \ + TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); } \ + TEST_F(FIXTURE, TestGetFlightInfoFuture) { TestGetFlightInfoFuture(); } \ + TEST_F(FIXTURE, TestListenerLifetime) { TestListenerLifetime(); } + } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/transport.cc b/cpp/src/arrow/flight/transport.cc index a0281ffd61e60..88228f2503597 100644 --- a/cpp/src/arrow/flight/transport.cc +++ b/cpp/src/arrow/flight/transport.cc @@ -24,6 +24,7 @@ #include "arrow/flight/client_auth.h" #include "arrow/flight/transport_server.h" #include "arrow/flight/types.h" +#include "arrow/flight/types_async.h" #include "arrow/ipc/message.h" #include "arrow/result.h" #include "arrow/status.h" @@ -74,6 +75,11 @@ Status ClientTransport::GetFlightInfo(const FlightCallOptions& options, std::unique_ptr* info) { return Status::NotImplemented("GetFlightInfo for this transport"); } +void ClientTransport::GetFlightInfoAsync( + const FlightCallOptions& options, const FlightDescriptor& descriptor, + std::shared_ptr> listener) { + listener->OnFinish(Status::NotImplemented("Async GetFlightInfo for this transport")); +} arrow::Result> ClientTransport::GetSchema( const FlightCallOptions& options, const FlightDescriptor& descriptor) { return Status::NotImplemented("GetSchema for this transport"); @@ -95,6 +101,16 @@ Status ClientTransport::DoExchange(const FlightCallOptions& options, std::unique_ptr* stream) { return Status::NotImplemented("DoExchange for this transport"); } +void ClientTransport::SetAsyncRpc(AsyncListenerBase* listener, + std::unique_ptr&& rpc) { + listener->rpc_state_ = std::move(rpc); +} +AsyncRpc* ClientTransport::GetAsyncRpc(AsyncListenerBase* listener) { + return listener->rpc_state_.get(); +} +std::unique_ptr ClientTransport::ReleaseAsyncRpc(AsyncListenerBase* listener) { + return std::move(listener->rpc_state_); +} class TransportRegistry::Impl final { public: diff --git a/cpp/src/arrow/flight/transport.h b/cpp/src/arrow/flight/transport.h index 6406734e6e7e7..69605d2112f1f 100644 --- a/cpp/src/arrow/flight/transport.h +++ b/cpp/src/arrow/flight/transport.h @@ -64,7 +64,9 @@ #include #include "arrow/flight/type_fwd.h" +#include "arrow/flight/types.h" #include "arrow/flight/visibility.h" +#include "arrow/ipc/options.h" #include "arrow/type_fwd.h" namespace arrow { @@ -182,6 +184,9 @@ class ARROW_FLIGHT_EXPORT ClientTransport { virtual Status GetFlightInfo(const FlightCallOptions& options, const FlightDescriptor& descriptor, std::unique_ptr* info); + virtual void GetFlightInfoAsync(const FlightCallOptions& options, + const FlightDescriptor& descriptor, + std::shared_ptr> listener); virtual arrow::Result> GetSchema( const FlightCallOptions& options, const FlightDescriptor& descriptor); virtual Status ListFlights(const FlightCallOptions& options, const Criteria& criteria, @@ -192,6 +197,12 @@ class ARROW_FLIGHT_EXPORT ClientTransport { std::unique_ptr* stream); virtual Status DoExchange(const FlightCallOptions& options, std::unique_ptr* stream); + + virtual bool supports_async() const { return false; } 
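+
+  // These helpers let a transport implementation stash per-RPC state on a
+  // listener: SetAsyncRpc stores an AsyncRpc in the listener's rpc_state_,
+  // GetAsyncRpc returns a raw pointer to it, and ReleaseAsyncRpc moves
+  // ownership back out (see transport.cc above), e.g. so the gRPC transport
+  // can hand the state to a background thread for destruction.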
+ + static void SetAsyncRpc(AsyncListenerBase* listener, std::unique_ptr&& rpc); + static AsyncRpc* GetAsyncRpc(AsyncListenerBase* listener); + static std::unique_ptr ReleaseAsyncRpc(AsyncListenerBase* listener); }; /// A registry of transport implementations. @@ -223,24 +234,33 @@ ARROW_FLIGHT_EXPORT TransportRegistry* GetDefaultTransportRegistry(); //------------------------------------------------------------ -// Error propagation helpers +// Async APIs -/// \brief Abstract status code as per the Flight specification. -enum class TransportStatusCode { - kOk = 0, - kUnknown = 1, - kInternal = 2, - kInvalidArgument = 3, - kTimedOut = 4, - kNotFound = 5, - kAlreadyExists = 6, - kCancelled = 7, - kUnauthenticated = 8, - kUnauthorized = 9, - kUnimplemented = 10, - kUnavailable = 11, +/// \brief Transport-specific state for an async RPC. +/// +/// Transport implementations may subclass this to store their own +/// state, and stash an instance in a user-supplied AsyncListener via +/// ClientTransport::GetAsyncRpc and ClientTransport::SetAsyncRpc. +/// +/// This API is EXPERIMENTAL. +class ARROW_FLIGHT_EXPORT AsyncRpc { + public: + virtual ~AsyncRpc() = default; + /// \brief Request cancellation of the RPC. + virtual void TryCancel() {} + + /// Only needed for DoPut/DoExchange + virtual void Begin(const FlightDescriptor& descriptor, std::shared_ptr schema) { + } + /// Only needed for DoPut/DoExchange + virtual void Write(arrow::flight::FlightStreamChunk chunk) {} + /// Only needed for DoPut/DoExchange + virtual void DoneWriting() {} }; +//------------------------------------------------------------ +// Error propagation helpers + /// \brief Abstract error status. /// /// Transport implementations may use side channels (e.g. HTTP diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc index 9b40015f9f729..7108f35549998 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc @@ -17,15 +17,19 @@ #include "arrow/flight/transport/grpc/grpc_client.h" +#include +#include #include #include #include #include #include +#include #include #include #include +#include #if defined(GRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS) #include #endif @@ -51,6 +55,7 @@ #include "arrow/flight/transport/grpc/serialization_internal.h" #include "arrow/flight/transport/grpc/util_internal.h" #include "arrow/flight/types.h" +#include "arrow/flight/types_async.h" namespace arrow { @@ -549,6 +554,127 @@ class GrpcResultStream : public ResultStream { std::unique_ptr<::grpc::ClientReader> stream_; }; +#ifdef GRPC_ENABLE_ASYNC +/// Force destruction to wait for RPC completion. +class FinishedFlag { + public: + ~FinishedFlag() { Wait(); } + + void Finish() { + std::lock_guard guard(mutex_); + finished_ = true; + cv_.notify_all(); + } + void Wait() const { + std::unique_lock guard(mutex_); + cv_.wait(guard, [&]() { return finished_; }); + } + + private: + mutable std::mutex mutex_; + mutable std::condition_variable cv_; + bool finished_{false}; +}; + +// XXX: it appears that if we destruct gRPC resources (like a +// ClientContext) from a gRPC callback, we will be running on a gRPC +// thread and we may attempt to join ourselves (because gRPC +// apparently refcounts threads). Avoid that by transferring gRPC +// resources to a dedicated thread for destruction. 
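+// Concretely: UnaryUnaryAsyncCall::Finish() below hands the released
+// AsyncRpc state to Dispose(), and GrpcClientImpl::Close() calls Stop() to
+// drain the queue and join the destructor thread.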
+class GrpcGarbageBin { + public: + GrpcGarbageBin() { + grpc_destructor_thread_ = std::thread([&]() { + while (true) { + std::unique_lock guard(grpc_destructor_mutex_); + grpc_destructor_cv_.wait(guard, + [&]() { return !running_ || !garbage_bin_.empty(); }); + + garbage_bin_.clear(); + + if (!running_) return; + } + }); + } + + void Dispose(std::unique_ptr trash) { + std::unique_lock guard(grpc_destructor_mutex_); + if (!running_) return; + garbage_bin_.push_back(std::move(trash)); + grpc_destructor_cv_.notify_all(); + } + + void Stop() { + { + std::unique_lock guard(grpc_destructor_mutex_); + running_ = false; + grpc_destructor_cv_.notify_all(); + } + grpc_destructor_thread_.join(); + } + + private: + bool running_ = true; + std::thread grpc_destructor_thread_; + std::mutex grpc_destructor_mutex_; + std::condition_variable grpc_destructor_cv_; + std::deque> garbage_bin_; +}; + +template +class UnaryUnaryAsyncCall : public ::grpc::ClientUnaryReactor, public internal::AsyncRpc { + public: + ClientRpc rpc; + std::shared_ptr> listener; + std::shared_ptr garbage_bin_; + + Request pb_request; + Response pb_response; + Status client_status; + + // Destruct last + FinishedFlag finished; + + explicit UnaryUnaryAsyncCall(const FlightCallOptions& options, + std::shared_ptr> listener, + std::shared_ptr garbage_bin) + : rpc(options), + listener(std::move(listener)), + garbage_bin_(std::move(garbage_bin)) {} + + void TryCancel() override { rpc.context.TryCancel(); } + + void OnDone(const ::grpc::Status& status) override { + if (status.ok()) { + auto result = internal::FromProto(pb_response); + client_status = result.status(); + if (client_status.ok()) { + listener->OnNext(std::move(result).MoveValueUnsafe()); + } + } + Finish(status); + } + + void Finish(const ::grpc::Status& status) { + auto listener = std::move(this->listener); + listener->OnFinish( + CombinedTransportStatus(status, std::move(client_status), &rpc.context)); + // SetAsyncRpc may trigger destruction, so Finish() first + finished.Finish(); + // Instead of potentially destructing gRPC resources here, + // transfer it to a dedicated background thread + garbage_bin_->Dispose( + flight::internal::ClientTransport::ReleaseAsyncRpc(listener.get())); + } +}; + +#define LISTENER_NOT_OK(LISTENER, EXPR) \ + if (auto arrow_status = (EXPR); !arrow_status.ok()) { \ + (LISTENER)->OnFinish(std::move(arrow_status)); \ + return; \ + } +#endif + class GrpcClientImpl : public internal::ClientTransport { public: static arrow::Result> Make() { @@ -702,14 +828,30 @@ class GrpcClientImpl : public internal::ClientTransport { stub_ = pb::FlightService::NewStub( ::grpc::experimental::CreateCustomChannelWithInterceptors( grpc_uri.str(), creds, args, std::move(interceptors))); + +#ifdef GRPC_ENABLE_ASYNC + garbage_bin_ = std::make_shared(); +#endif + return Status::OK(); } Status Close() override { - // TODO(ARROW-15473): if we track ongoing RPCs, we can cancel them first - // gRPC does not offer a real Close(). We could reset() the gRPC - // client but that can cause gRPC to hang in shutdown - // (ARROW-15793). +#ifdef GRPC_ENABLE_ASYNC + // TODO(https://github.com/apache/arrow/issues/30949): if there are async + // RPCs running when the client is stopped, then when they go to use the + // garbage bin, they'll instead synchronously dispose of resources from + // the callback thread, and will likely crash. 
We could instead cancel + them first and wait for completion before stopping the thread, but + tracking all of the RPCs may be unacceptable overhead for clients that + are making many small concurrent RPC calls, so it remains to be seen + whether there's a pressing need for this. + garbage_bin_->Stop(); +#endif + // TODO(https://github.com/apache/arrow/issues/30949): if we track ongoing + // RPCs, we can cancel them first. gRPC does not offer a real Close(). We + // could reset() the gRPC client but that can cause gRPC to hang in + // shutdown (https://github.com/apache/arrow/issues/31235). return Status::OK(); } @@ -745,8 +887,7 @@ class GrpcClientImpl : public internal::ClientTransport { pb::FlightInfo pb_info; while (!options.stop_token.IsStopRequested() && stream->Read(&pb_info)) { - FlightInfo::Data info_data; - RETURN_NOT_OK(internal::FromProto(pb_info, &info_data)); + ARROW_ASSIGN_OR_RAISE(FlightInfo info_data, internal::FromProto(pb_info)); flights.emplace_back(std::move(info_data)); } if (options.stop_token.IsStopRequested()) rpc.context.TryCancel(); @@ -796,9 +937,8 @@ stub_->GetFlightInfo(&rpc.context, pb_descriptor, &pb_response), &rpc.context); RETURN_NOT_OK(s); - FlightInfo::Data info_data; - RETURN_NOT_OK(internal::FromProto(pb_response, &info_data)); - info->reset(new FlightInfo(std::move(info_data))); + ARROW_ASSIGN_OR_RAISE(auto info_data, internal::FromProto(pb_response)); + *info = std::make_unique(std::move(info_data)); return Status::OK(); } @@ -855,6 +995,36 @@ return Status::OK(); } +#ifdef GRPC_ENABLE_ASYNC + void GetFlightInfoAsync(const FlightCallOptions& options, + const FlightDescriptor& descriptor, + std::shared_ptr> listener) override { + using AsyncCall = + UnaryUnaryAsyncCall; + auto call = std::make_unique(options, listener, garbage_bin_); + LISTENER_NOT_OK(listener, internal::ToProto(descriptor, &call->pb_request)); + LISTENER_NOT_OK(listener, call->rpc.SetToken(auth_handler_.get())); + + stub_->experimental_async()->GetFlightInfo(&call->rpc.context, &call->pb_request, + &call->pb_response, call.get()); + ClientTransport::SetAsyncRpc(listener.get(), std::move(call)); + arrow::internal::checked_cast( + ClientTransport::GetAsyncRpc(listener.get())) + ->StartCall(); + } + + bool supports_async() const override { return true; } +#else + void GetFlightInfoAsync(const FlightCallOptions& options, + const FlightDescriptor& descriptor, + std::shared_ptr> listener) override { + listener->OnFinish( + Status::NotImplemented("gRPC 1.40 or newer is required to use async")); + } + + bool supports_async() const override { return false; } +#endif + private: Status AuthenticateInternal(ClientRpc& rpc) { std::shared_ptr< @@ -894,6 +1064,10 @@ ::GRPC_NAMESPACE_FOR_TLS_CREDENTIALS_OPTIONS::TlsServerAuthorizationCheckConfig> noop_auth_check_; #endif + +#ifdef GRPC_ENABLE_ASYNC + std::shared_ptr garbage_bin_; +#endif }; std::once_flag kGrpcClientTransportInitialized; } // namespace @@ -907,6 +1081,8 @@ void InitializeFlightGrpcClient() { }); } +#undef LISTENER_NOT_OK + } // namespace grpc } // namespace transport } // namespace flight diff --git a/cpp/src/arrow/flight/transport/grpc/util_internal.cc b/cpp/src/arrow/flight/transport/grpc/util_internal.cc index f431fc30ec87a..88ec15bc66e08 100644 --- a/cpp/src/arrow/flight/transport/grpc/util_internal.cc +++
b/cpp/src/arrow/flight/transport/grpc/util_internal.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,7 @@ #include "arrow/flight/types.h" #include "arrow/status.h" #include "arrow/util/string.h" +#include "arrow/util/string_builder.h" namespace arrow { @@ -37,6 +39,8 @@ namespace flight { namespace transport { namespace grpc { +using internal::TransportStatus; + const char* kGrpcAuthHeader = "auth-token-bin"; const char* kGrpcStatusCodeHeader = "x-arrow-status"; const char* kGrpcStatusMessageHeader = "x-arrow-status-message-bin"; @@ -82,11 +86,106 @@ static bool FromGrpcContext(const ::grpc::ClientContext& ctx, return true; } +static TransportStatus TransportStatusFromGrpc(const ::grpc::Status& grpc_status) { + switch (grpc_status.error_code()) { + case ::grpc::StatusCode::OK: + return TransportStatus{TransportStatusCode::kOk, ""}; + case ::grpc::StatusCode::CANCELLED: + return TransportStatus{TransportStatusCode::kCancelled, + grpc_status.error_message()}; + case ::grpc::StatusCode::UNKNOWN: + return TransportStatus{TransportStatusCode::kUnknown, grpc_status.error_message()}; + case ::grpc::StatusCode::INVALID_ARGUMENT: + return TransportStatus{TransportStatusCode::kInvalidArgument, + grpc_status.error_message()}; + case ::grpc::StatusCode::DEADLINE_EXCEEDED: + return TransportStatus{TransportStatusCode::kTimedOut, grpc_status.error_message()}; + case ::grpc::StatusCode::NOT_FOUND: + return TransportStatus{TransportStatusCode::kNotFound, grpc_status.error_message()}; + case ::grpc::StatusCode::ALREADY_EXISTS: + return TransportStatus{TransportStatusCode::kAlreadyExists, + grpc_status.error_message()}; + case ::grpc::StatusCode::PERMISSION_DENIED: + return TransportStatus{TransportStatusCode::kUnauthorized, + grpc_status.error_message()}; + case ::grpc::StatusCode::RESOURCE_EXHAUSTED: + return TransportStatus{TransportStatusCode::kUnavailable, + grpc_status.error_message()}; + case ::grpc::StatusCode::FAILED_PRECONDITION: + return TransportStatus{TransportStatusCode::kUnavailable, + grpc_status.error_message()}; + case ::grpc::StatusCode::ABORTED: + return TransportStatus{TransportStatusCode::kUnavailable, + grpc_status.error_message()}; + case ::grpc::StatusCode::OUT_OF_RANGE: + return TransportStatus{TransportStatusCode::kInvalidArgument, + grpc_status.error_message()}; + case ::grpc::StatusCode::UNIMPLEMENTED: + return TransportStatus{TransportStatusCode::kUnimplemented, + grpc_status.error_message()}; + case ::grpc::StatusCode::INTERNAL: + return TransportStatus{TransportStatusCode::kInternal, grpc_status.error_message()}; + case ::grpc::StatusCode::UNAVAILABLE: + return TransportStatus{TransportStatusCode::kUnavailable, + grpc_status.error_message()}; + case ::grpc::StatusCode::DATA_LOSS: + return TransportStatus{TransportStatusCode::kInternal, grpc_status.error_message()}; + case ::grpc::StatusCode::UNAUTHENTICATED: + return TransportStatus{TransportStatusCode::kUnauthenticated, + grpc_status.error_message()}; + default: + return TransportStatus{TransportStatusCode::kUnknown, + util::StringBuilder("(", grpc_status.error_code(), ")", + grpc_status.error_message())}; + } +} + +Status CombinedTransportStatus(const ::grpc::Status& grpc_status, + arrow::Status arrow_status, ::grpc::ClientContext* ctx) { + if (grpc_status.ok() && arrow_status.ok()) { + return Status::OK(); + } else if (grpc_status.ok() && !arrow_status.ok()) { + return arrow_status; + } + + // Can't share with FromGrpcCode because that function sometimes constructs an Arrow + // 
Status directly + const TransportStatus base_status = TransportStatusFromGrpc(grpc_status); + + std::vector> details; + if (!grpc_status.ok() && ctx) { + // Attach rich error details + const std::multimap<::grpc::string_ref, ::grpc::string_ref>& trailers = + ctx->GetServerTrailingMetadata(); + + for (const auto key : { + // gRPC error details + kBinaryErrorDetailsKey, + // Sync C++ servers send information about the Arrow status + kGrpcStatusCodeHeader, + kGrpcStatusMessageHeader, + kGrpcStatusDetailHeader, + }) { + for (auto [it, end] = trailers.equal_range(key); it != end; it++) { + details.emplace_back(key, std::string(it->second.data(), it->second.size())); + } + } + } + + if (arrow_status.ok()) { + arrow_status = base_status.ToStatus(); + } + + if (!details.empty()) { + return arrow_status.WithDetail(std::make_shared( + base_status.code, std::move(base_status.message), std::move(details))); + } + return arrow_status; +} + /// Convert a gRPC status to an Arrow status, ignoring any /// implementation-defined headers that encode further detail. static Status FromGrpcCode(const ::grpc::Status& grpc_status) { - using internal::TransportStatus; - using internal::TransportStatusCode; switch (grpc_status.error_code()) { case ::grpc::StatusCode::OK: return Status::OK(); @@ -169,8 +268,6 @@ Status FromGrpcStatus(const ::grpc::Status& grpc_status, ::grpc::ClientContext* /// Convert an Arrow status to a gRPC status. static ::grpc::Status ToRawGrpcStatus(const Status& arrow_status) { - using internal::TransportStatus; - using internal::TransportStatusCode; if (arrow_status.ok()) return ::grpc::Status::OK; TransportStatus transport_status = TransportStatus::FromStatus(arrow_status); @@ -215,7 +312,7 @@ static ::grpc::Status ToRawGrpcStatus(const Status& arrow_status) { grpc_code = ::grpc::StatusCode::UNKNOWN; break; } - return ::grpc::Status(grpc_code, std::move(transport_status.message)); + return {grpc_code, std::move(transport_status.message)}; } /// Convert an Arrow status to a gRPC status, and add extra headers to diff --git a/cpp/src/arrow/flight/transport/grpc/util_internal.h b/cpp/src/arrow/flight/transport/grpc/util_internal.h index a267e55654467..5687c7a872a4d 100644 --- a/cpp/src/arrow/flight/transport/grpc/util_internal.h +++ b/cpp/src/arrow/flight/transport/grpc/util_internal.h @@ -18,6 +18,7 @@ #pragma once #include "arrow/flight/transport/grpc/protocol_grpc_internal.h" +#include "arrow/flight/types.h" #include "arrow/flight/visibility.h" #include "arrow/util/macros.h" @@ -71,6 +72,13 @@ extern const char* kGrpcStatusDetailHeader; ARROW_FLIGHT_EXPORT extern const char* kBinaryErrorDetailsKey; +/// \brief Combine a gRPC status, possible client-side Arrow status, +/// and a gRPC ClientContext into a transport status. +ARROW_FLIGHT_EXPORT +Status CombinedTransportStatus(const ::grpc::Status& grpc_status, + arrow::Status arrow_status, + ::grpc::ClientContext* ctx = nullptr); + /// Convert a gRPC status to an Arrow status. Optionally, provide a /// ClientContext to recover the exact Arrow status if it was passed /// over the wire. 
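The precedence implemented by `CombinedTransportStatus` can be summarized with a short sketch (illustrative only; the helper lives in an internal header, so application code would not normally call it):

```
// Both statuses OK       -> Status::OK()
// gRPC OK, Arrow error   -> the client-side Arrow error, unchanged
// gRPC error             -> status mapped from the gRPC code; any server
//                           trailers (e.g. "x-arrow-status") are attached
//                           as a TransportStatusDetail
::grpc::Status grpc_error{::grpc::StatusCode::UNAVAILABLE, "connection reset"};
arrow::Status st = arrow::flight::transport::grpc::CombinedTransportStatus(
    grpc_error, /*arrow_status=*/arrow::Status::OK(), /*ctx=*/nullptr);
// st is now a non-OK Status derived from TransportStatusCode::kUnavailable.
```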
diff --git a/cpp/src/arrow/flight/type_fwd.h b/cpp/src/arrow/flight/type_fwd.h index c82c4e6d8f533..ac2effbc91d96 100644 --- a/cpp/src/arrow/flight/type_fwd.h +++ b/cpp/src/arrow/flight/type_fwd.h @@ -24,6 +24,10 @@ class Uri; namespace flight { struct Action; struct ActionType; +template +class AsyncListener; +class AsyncListenerBase; +class AsyncRpc; struct BasicAuth; class ClientAuthHandler; class ClientMiddleware; @@ -51,6 +55,7 @@ class ServerMiddleware; class ServerMiddlewareFactory; struct Ticket; namespace internal { +class AsyncRpc; class ClientTransport; struct FlightData; class ServerTransport; diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 7c72595ed624b..b7cd55325b1f4 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -24,12 +24,16 @@ #include "arrow/buffer.h" #include "arrow/flight/serialization_internal.h" +#include "arrow/flight/types_async.h" #include "arrow/io/memory.h" #include "arrow/ipc/dictionary.h" #include "arrow/ipc/reader.h" #include "arrow/status.h" #include "arrow/table.h" +#include "arrow/util/base64.h" #include "arrow/util/formatting.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" #include "arrow/util/string_builder.h" #include "arrow/util/uri.h" @@ -299,9 +303,8 @@ arrow::Result> FlightInfo::Deserialize( if (!pb_info.ParseFromZeroCopyStream(&input)) { return Status::Invalid("Not a valid FlightInfo"); } - FlightInfo::Data data; - RETURN_NOT_OK(internal::FromProto(pb_info, &data)); - return std::make_unique(std::move(data)); + ARROW_ASSIGN_OR_RAISE(FlightInfo info, internal::FromProto(pb_info)); + return std::make_unique(std::move(info)); } std::string FlightInfo::ToString() const { @@ -873,5 +876,88 @@ arrow::Result BasicAuth::SerializeToString() const { return out; } +//------------------------------------------------------------ +// Error propagation helpers + +std::string ToString(TransportStatusCode code) { + switch (code) { + case TransportStatusCode::kOk: + return "kOk"; + case TransportStatusCode::kUnknown: + return "kUnknown"; + case TransportStatusCode::kInternal: + return "kInternal"; + case TransportStatusCode::kInvalidArgument: + return "kInvalidArgument"; + case TransportStatusCode::kTimedOut: + return "kTimedOut"; + case TransportStatusCode::kNotFound: + return "kNotFound"; + case TransportStatusCode::kAlreadyExists: + return "kAlreadyExists"; + case TransportStatusCode::kCancelled: + return "kCancelled"; + case TransportStatusCode::kUnauthenticated: + return "kUnauthenticated"; + case TransportStatusCode::kUnauthorized: + return "kUnauthorized"; + case TransportStatusCode::kUnimplemented: + return "kUnimplemented"; + case TransportStatusCode::kUnavailable: + return "kUnavailable"; + } + return "(unknown code)"; +} + +std::string TransportStatusDetail::ToString() const { + std::string repr = "TransportStatusDetail{"; + repr += arrow::flight::ToString(code()); + repr += ", message=\""; + repr += message(); + repr += "\", details={"; + + bool first = true; + for (const auto& [key, value] : details()) { + if (!first) { + repr += ", "; + } + first = false; + + repr += "{\""; + repr += key; + repr += "\", "; + if (arrow::internal::EndsWith(key, "-bin")) { + repr += arrow::util::base64_encode(value); + } else { + repr += "\""; + repr += value; + repr += "\""; + } + repr += "}"; + } + + repr += "}}"; + return repr; +} + +std::optional> +TransportStatusDetail::Unwrap(const Status& status) { + std::shared_ptr detail = status.detail(); + if (!detail) return std::nullopt; + 
if (detail->type_id() != kTypeId) return std::nullopt; + return std::cref(arrow::internal::checked_cast(*detail)); +} + +//------------------------------------------------------------ +// Async types + +AsyncListenerBase::AsyncListenerBase() = default; +AsyncListenerBase::~AsyncListenerBase() = default; +void AsyncListenerBase::TryCancel() { + if (rpc_state_) { + rpc_state_->TryCancel(); + } +} + } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index ca86c27e86976..c5d72d5167271 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -24,15 +24,18 @@ #include #include #include +#include #include #include #include #include +#include "arrow/flight/type_fwd.h" #include "arrow/flight/visibility.h" #include "arrow/ipc/options.h" #include "arrow/ipc/writer.h" #include "arrow/result.h" +#include "arrow/status.h" namespace arrow { @@ -71,7 +74,8 @@ namespace flight { /// > is from 0001-01-01T00:00:00Z to 9999-12-31T23:59:59.999999999Z. using Timestamp = std::chrono::system_clock::time_point; -/// \brief A Flight-specific status code. +/// \brief A Flight-specific status code. Used to encode some +/// additional status codes into an Arrow Status. enum class FlightStatusCode : int8_t { /// An implementation error has occurred. Internal, @@ -774,5 +778,81 @@ class ARROW_FLIGHT_EXPORT SimpleResultStream : public ResultStream { size_t position_; }; +/// \defgroup flight-error Error Handling +/// Types for handling errors from RPCs. Flight uses a set of status +/// codes standardized across Flight implementations, so these types +/// let applications work directly with those codes instead of having +/// to translate to and from Arrow Status. +/// @{ + +/// \brief Abstract status code for an RPC as per the Flight +/// specification. +enum class TransportStatusCode { + /// \brief No error. + kOk = 0, + /// \brief An unknown error occurred. + kUnknown = 1, + /// \brief An error occurred in the transport implementation, or an + /// error internal to the service implementation occurred. + kInternal = 2, + /// \brief An argument is invalid. + kInvalidArgument = 3, + /// \brief The request timed out. + kTimedOut = 4, + /// \brief An argument is not necessarily invalid, but references + /// some resource that does not exist. Prefer over + /// kInvalidArgument where applicable. + kNotFound = 5, + /// \brief The request attempted to create some resource that + /// already exists. + kAlreadyExists = 6, + /// \brief The request was explicitly cancelled. + kCancelled = 7, + /// \brief The client is not authenticated. + kUnauthenticated = 8, + /// \brief The client is not authorized to perform this request. + kUnauthorized = 9, + /// \brief The request is not implemented. + kUnimplemented = 10, + /// \brief There is a network connectivity error, or some resource + /// is otherwise unavailable. Most likely a temporary condition. + kUnavailable = 11, +}; + +/// \brief Convert a code to a string. +std::string ToString(TransportStatusCode code); + +/// \brief An error from an RPC call, using Flight error codes directly +/// instead of trying to translate to Arrow Status. +/// +/// Currently, only attached to the Status passed to AsyncListener::OnFinish. +/// +/// This API is EXPERIMENTAL.
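+///
+/// A minimal usage sketch (illustrative only):
+///
+///   if (auto detail = TransportStatusDetail::Unwrap(status)) {
+///     if (detail->get().code() == TransportStatusCode::kUnavailable) {
+///       // Transient failure: the caller may choose to retry.
+///     }
+///   }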
+class ARROW_FLIGHT_EXPORT TransportStatusDetail : public StatusDetail { + public: + constexpr static const char* kTypeId = "flight::TransportStatusDetail"; + explicit TransportStatusDetail(TransportStatusCode code, std::string message, + std::vector> details) + : code_(code), message_(std::move(message)), details_(std::move(details)) {} + const char* type_id() const override { return kTypeId; } + std::string ToString() const override; + + static std::optional> Unwrap( + const Status& status); + + TransportStatusCode code() const { return code_; } + std::string_view message() const { return message_; } + const std::vector>& details() const { + return details_; + } + + private: + TransportStatusCode code_; + std::string message_; + std::vector> details_; +}; + +/// @} + } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/flight/types_async.h b/cpp/src/arrow/flight/types_async.h new file mode 100644 index 0000000000000..a241e64fb4e49 --- /dev/null +++ b/cpp/src/arrow/flight/types_async.h @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/flight/type_fwd.h" +#include "arrow/flight/types.h" +#include "arrow/ipc/options.h" +#include "arrow/type_fwd.h" + +namespace arrow::flight { + +/// \defgroup flight-async Async Flight Types +/// Common types used for asynchronous Flight APIs. +/// @{ + +/// \brief Non-templated state for an async RPC. +/// +/// This API is EXPERIMENTAL. +class ARROW_FLIGHT_EXPORT AsyncListenerBase { + public: + AsyncListenerBase(); + virtual ~AsyncListenerBase(); + + /// \brief Request cancellation of the RPC. + /// + /// The RPC is not cancelled until AsyncListener::OnFinish is called. + void TryCancel(); + + private: + friend class arrow::flight::internal::ClientTransport; + + /// Transport-specific state for this RPC. Transport + /// implementations may store and retrieve state here via + /// ClientTransport::SetAsyncRpc and ClientTransport::GetAsyncRpc. + std::unique_ptr rpc_state_; +}; + +/// \brief Callbacks for results from async RPCs. +/// +/// A single listener may not be used for multiple concurrent RPC +/// calls. The application MUST hold the listener alive until +/// OnFinish() is called and has finished. +/// +/// This API is EXPERIMENTAL. +template +class ARROW_FLIGHT_EXPORT AsyncListener : public AsyncListenerBase { + public: + /// \brief Get the next server result. + /// + /// This will never be called concurrently with itself or OnFinish. + virtual void OnNext(T message) = 0; + /// \brief Get the final status. + /// + /// This will never be called concurrently with itself or OnNext. If the + /// error comes from the remote server, then a TransportStatusDetail will be + /// attached. 
Otherwise, the error is generated by the client-side + /// transport and will not have a TransportStatusDetail. + virtual void OnFinish(Status status) = 0; +}; + +/// @} + +} // namespace arrow::flight From 6e6e6f0340672ed49fb8e7cddf7bc47f2ca360dd Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Wed, 9 Aug 2023 23:11:15 +0800 Subject: [PATCH 127/749] GH-36931: [C++] Add cumulative_mean function (#36932) ### Rationale for this change Add `cumulative_mean` function ### What changes are included in this PR? Implement `cumulative_mean` function. The current cumulative_* kernel generator can only be based on a simple binary arithmetic op and the state can only be a single value. I refactored it to use a generic state such that it can handle complex operations such as `mean`, `median`, `var`, etc. ### Are these changes tested? Yes ### Are there any user-facing changes? No * Closes: #36931 Lead-authored-by: Jin Shang Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/api_vector.cc | 5 + cpp/src/arrow/compute/api_vector.h | 11 + .../compute/kernels/vector_cumulative_ops.cc | 195 +++++++++++++----- .../kernels/vector_cumulative_ops_test.cc | 100 ++++++++- docs/source/cpp/compute.rst | 34 +-- 5 files changed, 268 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index f73b10e11edd7..d47ee42ebf239 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -417,5 +417,10 @@ Result CumulativeMin(const Datum& values, const CumulativeOptions& option return CallFunction("cumulative_min", {Datum(values)}, &options, ctx); } +Result CumulativeMean(const Datum& values, const CumulativeOptions& options, + ExecContext* ctx) { + return CallFunction("cumulative_mean", {Datum(values)}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 4f226ac00788a..0233090ef6fb9 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -226,6 +226,7 @@ class ARROW_EXPORT CumulativeOptions : public FunctionOptions { /// - prod: 1 /// - min: maximum of the input type /// - max: minimum of the input type + /// - mean: start is ignored because it has no meaning for mean std::optional> start; /// If true, nulls in the input are ignored and produce a corresponding null output. @@ -661,6 +662,16 @@ Result CumulativeMin( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), ExecContext* ctx = NULLPTR); +/// \brief Compute the cumulative mean of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative mean behavior, `start` is ignored +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMean( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + /// \brief Return the first order difference of an array. /// /// Computes the first order difference of an array, i.e.
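As a usage sketch of the new function (illustrative; the `Example` wrapper and input values are made up), each output element is the running mean of all inputs up to and including that position:

```
#include "arrow/compute/api_vector.h"

arrow::Result<arrow::Datum> Example(const std::shared_ptr<arrow::Array>& values) {
  // For an int64 input [1, 2, 3, 4] this yields the float64 array
  // [1.0, 1.5, 2.0, 2.5]; unlike the other cumulative functions,
  // cumulative_mean always produces a double output.
  return arrow::compute::CumulativeMean(values);
}
```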
diff --git a/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc b/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc
index 82caa3bff59aa..86d2679486726 100644
--- a/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc
+++ b/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc
@@ -25,12 +25,11 @@
 #include "arrow/compute/kernels/codegen_internal.h"
 #include "arrow/compute/kernels/common_internal.h"
 #include "arrow/result.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/visit_type_inline.h"
 
-namespace arrow {
-namespace compute {
-namespace internal {
+namespace arrow::compute::internal {
 
 namespace {
 
@@ -63,19 +62,60 @@ struct CumulativeOptionsWrapper : public OptionsWrapper<CumulativeOptions> {
   }
 };
 
-// The driver kernel for all cumulative compute functions. Op is a compute kernel
-// representing any binary associative operation with an identity element (add, product,
-// min, max, etc.), i.e. ones that form a monoid, and OptionsType the options type
-// corresponding to Op. ArgType and OutType are the input and output types, which will
+// The cumulative value is computed based on a simple arithmetic binary op
+// such as Add, Mul, Min, Max, etc.
+template <typename ArgType, typename Op>
+struct CumulativeBinaryOp {
+  using OutType = ArgType;
+  using OutValue = typename GetOutputType<OutType>::T;
+  using ArgValue = typename GetViewType<ArgType>::T;
+
+  OutValue current_value;
+
+  CumulativeBinaryOp() { current_value = Identity<Op>::template value<OutValue>; }
+
+  explicit CumulativeBinaryOp(const std::shared_ptr<Scalar> start) {
+    current_value = UnboxScalar<OutType>::Unbox(*start);
+  }
+
+  OutValue Call(KernelContext* ctx, ArgValue arg, Status* st) {
+    current_value =
+        Op::template Call<OutValue, ArgValue, ArgValue>(ctx, arg, current_value, st);
+    return current_value;
+  }
+};
+
+template <typename ArgType>
+struct CumulativeMean {
+  using OutType = DoubleType;
+  using ArgValue = typename GetViewType<ArgType>::T;
+  int64_t count = 0;
+  double sum = 0;
+
+  CumulativeMean() = default;
+
+  // start value is ignored for CumulativeMean
+  explicit CumulativeMean(const std::shared_ptr<Scalar> start) {}
+
+  double Call(KernelContext* ctx, ArgValue arg, Status* st) {
+    sum += static_cast<double>(arg);
+    ++count;
+    return sum / count;
+  }
+};
+
+// The driver kernel for all cumulative compute functions.
+// ArgType and OutType are the input and output types, which will
 // normally be the same (e.g. the cumulative sum of an array of Int64Type will result in
-// an array of Int64Type).
-template <typename Op, typename ArgType, typename OutType, typename OptionsType>
+// an array of Int64Type) with the exception of CumulativeMean, which will always return
+// a double.
+template <typename ArgType, typename CumulativeState>
 struct Accumulator {
-  using OutValue = typename GetOutputType<OutType>::T;
+  using OutType = typename CumulativeState::OutType;
   using ArgValue = typename GetViewType<ArgType>::T;
 
   KernelContext* ctx;
-  ArgValue current_value;
+  CumulativeState current_state;
   bool skip_nulls;
   bool encountered_null = false;
   NumericBuilder<OutType> builder;
@@ -88,11 +128,7 @@ struct Accumulator {
     if (skip_nulls || (input.GetNullCount() == 0 && !encountered_null)) {
       VisitArrayValuesInline<ArgType>(
           input,
-          [&](ArgValue v) {
-            current_value = Op::template Call<OutValue, ArgValue, ArgValue>(
-                ctx, v, current_value, &st);
-            builder.UnsafeAppend(current_value);
-          },
+          [&](ArgValue v) { builder.UnsafeAppend(current_state.Call(ctx, v, &st)); },
           [&]() { builder.UnsafeAppendNull(); });
     } else {
       int64_t nulls_start_idx = 0;
@@ -100,9 +136,7 @@ struct Accumulator {
           input,
           [&](ArgValue v) {
             if (!encountered_null) {
-              current_value = Op::template Call<OutValue, ArgValue, ArgValue>(
-                  ctx, v, current_value, &st);
-              builder.UnsafeAppend(current_value);
+              builder.UnsafeAppend(current_state.Call(ctx, v, &st));
               ++nulls_start_idx;
             }
           },
@@ -115,16 +149,17 @@ struct Accumulator {
   }
 };
 
-template <typename Op, typename ArgType, typename OutType, typename OptionsType>
+template <typename ArgType, typename CumulativeState>
 struct CumulativeKernel {
+  using OutType = typename CumulativeState::OutType;
   using OutValue = typename GetOutputType<OutType>::T;
   static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
     const auto& options = CumulativeOptionsWrapper::Get(ctx);
-    Accumulator<Op, ArgType, OutType, OptionsType> accumulator(ctx);
+    Accumulator<ArgType, CumulativeState> accumulator(ctx);
     if (options.start.has_value()) {
-      accumulator.current_value = UnboxScalar<OutType>::Unbox(*(options.start.value()));
+      accumulator.current_state = CumulativeState(options.start.value());
     } else {
-      accumulator.current_value = Identity<Op>::template value<OutValue>;
+      accumulator.current_state = CumulativeState();
     }
     accumulator.skip_nulls = options.skip_nulls;
@@ -138,16 +173,17 @@ struct CumulativeKernel {
   }
 };
 
-template <typename Op, typename ArgType, typename OutType, typename OptionsType>
+template <typename ArgType, typename CumulativeState>
 struct CumulativeKernelChunked {
+  using OutType = typename CumulativeState::OutType;
   using OutValue = typename GetOutputType<OutType>::T;
   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     const auto& options = CumulativeOptionsWrapper::Get(ctx);
-    Accumulator<Op, ArgType, OutType, OptionsType> accumulator(ctx);
+    Accumulator<ArgType, CumulativeState> accumulator(ctx);
     if (options.start.has_value()) {
-      accumulator.current_value = UnboxScalar<OutType>::Unbox(*(options.start.value()));
+      accumulator.current_state = CumulativeState(options.start.value());
     } else {
-      accumulator.current_value = Identity<Op>::template value<OutValue>;
+      accumulator.current_state = CumulativeState();
     }
     accumulator.skip_nulls = options.skip_nulls;
@@ -217,11 +253,52 @@ const FunctionDoc cumulative_min_doc{
     "start as the new minimum)."),
     {"values"},
     "CumulativeOptions"};
-}  // namespace
 
-template <typename Op, typename OptionsType>
-void MakeVectorCumulativeFunction(FunctionRegistry* registry, const std::string func_name,
-                                  const FunctionDoc doc) {
+const FunctionDoc cumulative_mean_doc{
+    "Compute the cumulative mean over a numeric input",
+    ("`values` must be numeric. Return an array/chunked array which is the\n"
+     "cumulative mean computed over `values`. CumulativeOptions::start_value is \n"
+     "ignored."),
+    {"values"},
+    "CumulativeOptions"};
+
+// Kernel factory for complex stateful computations.
+template