From 5eea5c93d8441455d09bcc4ef679c6e3b05ab6eb Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 23 Jul 2024 07:26:19 +0200 Subject: [PATCH 01/73] GH-41951: [Java] Add @FormatMethod annotations (#43376) ### What changes are included in this PR? Annotate several methods using format-like string with error-prone `@ FormatMethod` and `@ FormatString` annotations. Update error-prone version to 2.29.2 and remove unused error-prone javac version property. ### Are these changes tested? CI/CD ### Are there any user-facing changes? None * GitHub Issue: #41951 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/memory/memory-core/pom.xml | 4 ++++ .../memory-core/src/main/java/module-info.java | 1 + .../org/apache/arrow/memory/BaseAllocator.java | 7 ++++--- .../apache/arrow/memory/util/HistoricalLog.java | 17 ++++++++--------- java/pom.xml | 9 +++++++-- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/java/memory/memory-core/pom.xml b/java/memory/memory-core/pom.xml index ce78fc479232a..db1b0199bb72f 100644 --- a/java/memory/memory-core/pom.xml +++ b/java/memory/memory-core/pom.xml @@ -47,6 +47,10 @@ under the License. org.checkerframework checker-qual + + com.google.errorprone + error_prone_annotations + diff --git a/java/memory/memory-core/src/main/java/module-info.java b/java/memory/memory-core/src/main/java/module-info.java index e2a07626c386f..e74044ea41d4a 100644 --- a/java/memory/memory-core/src/main/java/module-info.java +++ b/java/memory/memory-core/src/main/java/module-info.java @@ -27,5 +27,6 @@ requires jsr305; requires static org.checkerframework.checker.qual; requires static org.immutables.value.annotations; + requires static com.google.errorprone.annotations; requires org.slf4j; } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 3f4426d2c36e5..dd6375e910b92 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -16,6 +16,8 @@ */ package org.apache.arrow.memory; +import com.google.errorprone.annotations.FormatMethod; +import com.google.errorprone.annotations.FormatString; import java.util.Collection; import java.util.Collections; import java.util.HashSet; @@ -539,9 +541,8 @@ public String toVerboseString() { return sb.toString(); } - /* Remove @SuppressWarnings after fixing https://github.com/apache/arrow/issues/41951 */ - @SuppressWarnings("FormatStringAnnotation") - private void hist(String noteFormat, Object... args) { + @FormatMethod + private void hist(@FormatString String noteFormat, Object... 
args) { if (historicalLog != null) { historicalLog.recordEvent(noteFormat, args); } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java index 659ddde28df9b..5b1bdd8b7244c 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java @@ -16,6 +16,8 @@ */ package org.apache.arrow.memory.util; +import com.google.errorprone.annotations.FormatMethod; +import com.google.errorprone.annotations.FormatString; import java.util.ArrayDeque; import java.util.Arrays; import java.util.Deque; @@ -42,9 +44,8 @@ public class HistoricalLog { * object instance is best. * @param args for the format string, or nothing if none are required */ - @SuppressWarnings("FormatStringAnnotation") - /* Remove @SuppressWarnings after fixing https://github.com/apache/arrow/issues/41951 */ - public HistoricalLog(final String idStringFormat, Object... args) { + @FormatMethod + public HistoricalLog(@FormatString final String idStringFormat, Object... args) { this(Integer.MAX_VALUE, idStringFormat, args); } @@ -65,9 +66,8 @@ public HistoricalLog(final String idStringFormat, Object... args) { * object instance is best. * @param args for the format string, or nothing if none are required */ - @SuppressWarnings("AnnotateFormatMethod") - public HistoricalLog(final int limit, final String idStringFormat, Object... args) { - // Remove @SuppressWarnings after fixing https://github.com/apache/arrow/issues/41951 + @FormatMethod + public HistoricalLog(final int limit, @FormatString final String idStringFormat, Object... args) { this.limit = limit; this.idString = String.format(idStringFormat, args); this.firstEvent = null; @@ -80,9 +80,8 @@ public HistoricalLog(final int limit, final String idStringFormat, Object... arg * @param noteFormat {@link String#format} format string that describes the event * @param args for the format string, or nothing if none are required */ - @SuppressWarnings("AnnotateFormatMethod") - public synchronized void recordEvent(final String noteFormat, Object... args) { - // Remove @SuppressWarnings after fixing https://github.com/apache/arrow/issues/41951 + @FormatMethod + public synchronized void recordEvent(@FormatString final String noteFormat, Object... args) { final String note = String.format(noteFormat, args); final Event event = new Event(note); if (firstEvent == null) { diff --git a/java/pom.xml b/java/pom.xml index 45acf9dd0c732..a6c1002adf382 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -106,8 +106,7 @@ under the License. 2 true - 9+181-r4173-1 - 2.28.0 + 2.29.2 5.11.0 5.2.0 3.45.0 @@ -160,6 +159,12 @@ under the License. jsr305 3.0.2 + + com.google.errorprone + error_prone_annotations + ${error_prone_core.version} + provided + org.slf4j slf4j-api From 7a149d8bd8dc4004575c2a5dda379dc5b29e9892 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:34:51 +0900 Subject: [PATCH 02/73] MINOR: [Java] Bump com.h2database:h2 from 2.2.224 to 2.3.230 in /java (#43364) Bumps [com.h2database:h2](https://github.com/h2database/h2database) from 2.2.224 to 2.3.230.
Release notes (version-2.3.230, truncated). Notable commits: add DB_OBJECT_APPROXIMATE_SIZE and DB_OBJECT_APPROXIMATE_TOTAL_SIZE; fix a "double mark" db corruption issue when opening a backup with young dead chunks.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/adapter/jdbc/pom.xml | 2 +- java/performance/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index e86dfcb0b0a52..124cc535c25bf 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -59,7 +59,7 @@ under the License. com.h2database h2 - 2.2.224 + 2.3.230 test diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 0dfc26b469ce2..f6d3a26b4f352 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -75,7 +75,7 @@ under the License. com.h2database h2 - 2.2.224 + 2.3.230 runtime From aacb241126ab2b2cb9e064b5717a1f95e2cdf029 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:35:19 +0900 Subject: [PATCH 03/73] MINOR: [Java] Bump com.github.luben:zstd-jni from 1.5.6-3 to 1.5.6-4 in /java (#43363) Bumps [com.github.luben:zstd-jni](https://github.com/luben/zstd-jni) from 1.5.6-3 to 1.5.6-4.
Notable commits in v1.5.6-4: allow specifying the Zstd temp folder with a property, implement Objects.checkFromIndexSize, add checks for offset/size/length, and rework the Intel macOS build target.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/compression/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/compression/pom.xml b/java/compression/pom.xml index 79105dbfccda5..8774f7cabde94 100644 --- a/java/compression/pom.xml +++ b/java/compression/pom.xml @@ -55,7 +55,7 @@ under the License. com.github.luben zstd-jni - 1.5.6-3 + 1.5.6-4 From a88f0cd0368f81f1b9243d443cd571d09242c5b1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:39:58 +0900 Subject: [PATCH 04/73] MINOR: [Java] Bump commons-codec:commons-codec from 1.17.0 to 1.17.1 in /java (#43367) Bumps [commons-codec:commons-codec](https://github.com/apache/commons-codec) from 1.17.0 to 1.17.1.
Changelog (Apache Commons Codec 1.17.1): a feature and fix release requiring a minimum of Java 8. Md5Crypt now throws IllegalArgumentException on an invalid prefix; org.apache.commons:commons-parent was bumped from 69 to 71, animal-sniffer-maven-plugin from 1.23 to 1.24, and taglist-maven-plugin from 3.0.0 to 3.1.0. Full release notes: https://commons.apache.org/proper/commons-codec/
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/vector/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 127a16511f01f..7a53bf754aae4 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -60,7 +60,7 @@ under the License. commons-codec commons-codec - 1.17.0 + 1.17.1 org.apache.arrow From de19af928a83c09d12cd0f3d83932bfa8cca9b19 Mon Sep 17 00:00:00 2001 From: "Seb. V" Date: Tue, 23 Jul 2024 13:47:26 +0200 Subject: [PATCH 05/73] GH-43359: [Go][Parquet] ReadRowGroups panics with canceled context (#43360) ### Rationale for this change `ReadRowGroups` needs to support externally canceled contexts, e.g. for request-scoped contexts in servers like gRPC. ### What changes are included in this PR? Additionnaly, `releaseColumns` needs to ignore columns with uninitialized data as it used in a `defer` statement. ### Are these changes tested? Yes: a new test `TestArrowReaderCanceledContext` is included. ### Are there any user-facing changes? None * GitHub Issue: #43359 Authored-by: sebdotv Signed-off-by: Joel Lubinitsky --- go/parquet/pqarrow/file_reader.go | 5 +++++ go/parquet/pqarrow/file_reader_test.go | 23 +++++++++++++++++++++++ go/parquet/pqarrow/helpers.go | 4 +++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/go/parquet/pqarrow/file_reader.go b/go/parquet/pqarrow/file_reader.go index 208ac9ceebadf..a2e84d9ce2795 100755 --- a/go/parquet/pqarrow/file_reader.go +++ b/go/parquet/pqarrow/file_reader.go @@ -18,6 +18,7 @@ package pqarrow import ( "context" + "errors" "fmt" "io" "sync" @@ -375,6 +376,10 @@ func (fr *FileReader) ReadRowGroups(ctx context.Context, indices, rowGroups []in data.data.Release() } + // if the context is in error, but we haven't set an error yet, then it means that the parent context + // was cancelled. In this case, we should exit early as some columns may not have been read yet. + err = errors.Join(err, ctx.Err()) + if err != nil { // if we encountered an error, consume any waiting data on the channel // so the goroutines don't leak and so memory can get cleaned up. 
we already diff --git a/go/parquet/pqarrow/file_reader_test.go b/go/parquet/pqarrow/file_reader_test.go index b7d178f8644de..fe5a4547a775c 100644 --- a/go/parquet/pqarrow/file_reader_test.go +++ b/go/parquet/pqarrow/file_reader_test.go @@ -167,6 +167,29 @@ func TestArrowReaderAdHocReadFloat16s(t *testing.T) { } } +func TestArrowReaderCanceledContext(t *testing.T) { + dataDir := getDataDir() + + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + filename := filepath.Join(dataDir, "int32_decimal.parquet") + require.FileExists(t, filename) + + rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem))) + require.NoError(t, err) + defer rdr.Close() + arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem) + require.NoError(t, err) + + // create a canceled context + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err = arrowRdr.ReadTable(ctx) + require.ErrorIs(t, err, context.Canceled) +} + func TestRecordReaderParallel(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) diff --git a/go/parquet/pqarrow/helpers.go b/go/parquet/pqarrow/helpers.go index 800cd84192005..237de4366c03e 100644 --- a/go/parquet/pqarrow/helpers.go +++ b/go/parquet/pqarrow/helpers.go @@ -38,6 +38,8 @@ func releaseArrayData(data []arrow.ArrayData) { func releaseColumns(columns []arrow.Column) { for _, col := range columns { - col.Release() + if col.Data() != nil { // data can be nil due to the way columns are constructed in ReadRowGroups + col.Release() + } } } From f95773d028b75b4a902a3c896e5e63e04dde0deb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:52:08 -0700 Subject: [PATCH 06/73] MINOR: [C#] Bump Grpc.Tools from 2.64.0 to 2.65.0 in /csharp (#43373) Bumps [Grpc.Tools](https://github.com/grpc/grpc) from 2.64.0 to 2.65.0.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj | 2 +- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj index ee6d42c8d17fc..1870888184906 100644 --- a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj +++ b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 5030d37cdb16d..9ad99894e1ce3 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + From c53a93216651f2940c328c91b20f5afc5f379530 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:13:15 -0700 Subject: [PATCH 07/73] MINOR: [C#] Bump Grpc.Net.Client and System.Runtime.CompilerServices.Unsafe in /csharp (#43372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Grpc.Net.Client](https://github.com/grpc/grpc-dotnet) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `Grpc.Net.Client` from 2.63.0 to 2.64.0
Release notes (Grpc.Net.Client v2.64.0): full changelog at https://github.com/grpc/grpc-dotnet/compare/v2.63.0...v2.64.0
Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes (System.Runtime.CompilerServices.Unsafe, truncated): release listings for .NET 6.0 and the .NET 5.0.x line; .NET 5 is now out of support and .NET 6 is recommended.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 9ad99894e1ce3..5334f877873e4 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -6,7 +6,7 @@ - + From bc9816704c43eef88f32139a2497b561c5177413 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 24 Jul 2024 00:21:01 +0300 Subject: [PATCH 08/73] GH-43393: [C++][Parquet] parquet-dump-footer: Remove redundant link and fix --debug processing (#43375) ### Rationale for this change * We don't need to link to `libarrow` explicitly because `parquet_shared`/`parquet_static` does it automatically * `--help` shows `--debug` but the implementation accepts `--json` not `--debug` ### What changes are included in this PR? * Remove the redundant `libarrow` link * Accept `--debug` not `--json` ### Are these changes tested? Manually. ### Are there any user-facing changes? Yes. * GitHub Issue: #43393 Authored-by: Alkis Evlogimenos Signed-off-by: Sutou Kouhei --- cpp/src/parquet/metadata.cc | 4 ++-- cpp/tools/parquet/CMakeLists.txt | 1 - cpp/tools/parquet/parquet_dump_footer.cc | 14 +++++++------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 7bab9104619ce..139793219df90 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1053,8 +1053,8 @@ std::shared_ptr FileMetaData::Subset( return impl_->Subset(row_groups); } -std::string FileMetaData::SerializeUnencrypted(bool scrub, bool json) const { - return impl_->SerializeUnencrypted(scrub, json); +std::string FileMetaData::SerializeUnencrypted(bool scrub, bool debug) const { + return impl_->SerializeUnencrypted(scrub, debug); } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index e05645da28a0e..87c3254607589 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,7 +31,6 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer ${ARROW_LIBRARIES}) add_dependencies(parquet ${PARQUET_TOOLS}) endif() diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index c7a4b78fdd823..4dd7476bc8ea3 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -38,7 +38,7 @@ void AppendLE32(uint32_t v, std::string* out) { out->append(reinterpret_cast(&v), sizeof(v)); } -int DoIt(std::string in, bool scrub, bool json, std::string out) { +int DoIt(std::string in, bool scrub, bool debug, std::string out) { std::string path; auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie(); auto file = fs->OpenInputFile(path).ValueOrDie(); @@ -73,8 +73,8 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); } auto md = FileMetaData::Make(tail.data(), &metadata_len); - std::string ser = md->SerializeUnencrypted(scrub, json); - if (!json) { + std::string ser = md->SerializeUnencrypted(scrub, debug); + if 
(!debug) { AppendLE32(static_cast(ser.size()), &ser); ser.append("PAR1", 4); } @@ -107,7 +107,7 @@ static int PrintHelp() { int main(int argc, char** argv) { bool scrub = true; - bool json = false; + bool debug = false; std::string in; std::string out; for (int i = 1; i < argc; i++) { @@ -116,8 +116,8 @@ int main(int argc, char** argv) { return PrintHelp(); } else if (!std::strcmp(arg, "--no-scrub")) { scrub = false; - } else if (!std::strcmp(arg, "--json")) { - json = true; + } else if (!std::strcmp(arg, "--debug")) { + debug = true; } else if (!std::strcmp(arg, "--in")) { if (i + 1 >= argc) return PrintHelp(); in = argv[++i]; @@ -131,5 +131,5 @@ int main(int argc, char** argv) { } if (in.empty()) return PrintHelp(); - return parquet::DoIt(in, scrub, json, out); + return parquet::DoIt(in, scrub, debug, out); } From aebe427f1e774d0a22f39b1e9e9de453383b944c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:22:59 -0700 Subject: [PATCH 09/73] MINOR: [C#] Bump Grpc.AspNetCore.Server from 2.63.0 to 2.64.0 in /csharp (#43371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Grpc.AspNetCore.Server](https://github.com/grpc/grpc-dotnet) from 2.63.0 to 2.64.0.
Release notes (Grpc.AspNetCore.Server v2.64.0): full changelog at https://github.com/grpc/grpc-dotnet/compare/v2.63.0...v2.64.0
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Flight.AspNetCore.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj b/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj index 2dd1d9d8f98e2..ac1f8c9bae77a 100644 --- a/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj +++ b/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj @@ -5,7 +5,7 @@ - + From 097ec3d730040f1ecd3b5ecfd994f473b94ea2eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:58:42 -0400 Subject: [PATCH 10/73] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.22.1 to 2.23.0 in /go (#43370) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.22.1 to 2.23.0.
Release notes (v2.23.0): conditionally validate names of Record, Enum and Fixed types (#415), add support for recursive schemas and structs (#413), allow multi-line doc and custom fields (#411), and output doc comments for package, structs, and fields in avrogen (#405). Full changelog: https://github.com/hamba/avro/compare/v2.22.1...v2.23.0
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 1c730cc87709b..af387176ead29 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.22.1 + github.com/hamba/avro/v2 v2.23.0 github.com/huandu/xstrings v1.4.0 github.com/substrait-io/substrait-go v0.4.2 github.com/tidwall/sjson v1.2.5 diff --git a/go/go.sum b/go/go.sum index 6ce51c83350a0..e2dfc19652116 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.22.1 h1:q1rAbfJsrbMaZPDLQvwUQMfQzp6H+hGXvckmU/lXemk= -github.com/hamba/avro/v2 v2.22.1/go.mod h1:HOeTrE3kvWnBAgsufqhAzDDV5gvS0QXs65Z6BHfGgbg= +github.com/hamba/avro/v2 v2.23.0 h1:DYWz6UqNCi21JflaZlcwNfW+rK+D/CwnrWWJtfmO4vw= +github.com/hamba/avro/v2 v2.23.0/go.mod h1:7vDfy/2+kYCE8WUHoj2et59GTv0ap7ptktMXu0QHePI= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From 6c9f08eee365f2688c8ddd53ba9616d82d8b06b1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 18:05:35 -0400 Subject: [PATCH 11/73] MINOR: [Go] Bump github.com/substrait-io/substrait-go from 0.4.2 to 0.5.0 in /go (#43369) Bumps [github.com/substrait-io/substrait-go](https://github.com/substrait-io/substrait-go) from 0.4.2 to 0.5.0.
Release notes (v0.5.0, 2024-07-20): remove the requirement for VirtualTableReadRel to have columns (#31), introduce rewrite tools on Rel (#29), bump the substrait proto dependency to v0.33.0, and update package docs.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index af387176ead29..43c2c41b69eca 100644 --- a/go/go.mod +++ b/go/go.mod @@ -49,7 +49,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hamba/avro/v2 v2.23.0 github.com/huandu/xstrings v1.4.0 - github.com/substrait-io/substrait-go v0.4.2 + github.com/substrait-io/substrait-go v0.5.0 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index e2dfc19652116..a96f0a3797c74 100644 --- a/go/go.sum +++ b/go/go.sum @@ -99,8 +99,8 @@ github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/substrait-io/substrait-go v0.4.2 h1:buDnjsb3qAqTaNbOR7VKmNgXf4lYQxWEcnSGUWBtmN8= -github.com/substrait-io/substrait-go v0.4.2/go.mod h1:qhpnLmrcvAnlZsUyPXZRqldiHapPTXC3t7xFgDi3aQg= +github.com/substrait-io/substrait-go v0.5.0 h1:8sYsoqcrzoNpThPyot1CQpwF6OokxvplLUQJTGlKws4= +github.com/substrait-io/substrait-go v0.5.0/go.mod h1:Co7ko6iIjdqCGcN3LfkKWPVlxONkNZem9omWAGIaOrQ= github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= From 4835a3cec8f1420e9515cfabdd757126574c9342 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Wed, 24 Jul 2024 03:25:10 +0200 Subject: [PATCH 12/73] GH-43396: [Java] Remove/replace jsr305 (#43397) ### Rationale for this change jsr305 is not maintained anymore and is unlikely to support JPMS. The classes are also in the javax. namespace which is known to cause issues as well. ### What changes are included in this PR? Replace most uses of jsr305 with the equivalent checker framework annotations, and remove usage of `@ ThreadSafe.` ### Are these changes tested? CI/CD ### Are there any user-facing changes? 
None * GitHub Issue: #43396 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/flight/flight-core/src/main/java/module-info.java | 1 - java/flight/flight-sql-jdbc-core/pom.xml | 6 ++---- .../driver/jdbc/client/ArrowFlightSqlClientHandler.java | 2 +- java/memory/memory-core/pom.xml | 4 ---- java/memory/memory-core/src/main/java/module-info.java | 1 - .../src/main/java/org/apache/arrow/memory/Accountant.java | 2 -- java/pom.xml | 5 ----- 7 files changed, 3 insertions(+), 18 deletions(-) diff --git a/java/flight/flight-core/src/main/java/module-info.java b/java/flight/flight-core/src/main/java/module-info.java index ff0d7427b59cc..e668fe6149fb9 100644 --- a/java/flight/flight-core/src/main/java/module-info.java +++ b/java/flight/flight-core/src/main/java/module-info.java @@ -35,7 +35,6 @@ requires io.netty.common; requires io.netty.handler; requires io.netty.transport; - requires jsr305; requires org.apache.arrow.format; requires org.apache.arrow.memory.core; requires org.apache.arrow.vector; diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml index 4833d30dbc33f..502d866fcc0bd 100644 --- a/java/flight/flight-sql-jdbc-core/pom.xml +++ b/java/flight/flight-sql-jdbc-core/pom.xml @@ -132,10 +132,8 @@ under the License.
- com.google.code.findbugs - jsr305 - 3.0.2 - compile + org.checkerframework + checker-qual diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java index 845f5372d3f74..0e9c79a0907a5 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java @@ -29,7 +29,6 @@ import java.util.Map; import java.util.Optional; import java.util.Set; -import javax.annotation.Nullable; import org.apache.arrow.driver.jdbc.client.utils.ClientAuthenticationUtils; import org.apache.arrow.flight.CallOption; import org.apache.arrow.flight.CallStatus; @@ -61,6 +60,7 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.calcite.avatica.Meta.StatementType; +import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/java/memory/memory-core/pom.xml b/java/memory/memory-core/pom.xml index db1b0199bb72f..b9fa8ab1a6942 100644 --- a/java/memory/memory-core/pom.xml +++ b/java/memory/memory-core/pom.xml @@ -31,10 +31,6 @@ under the License. Core off-heap memory management libraries for Arrow ValueVectors. - - com.google.code.findbugs - jsr305 - org.slf4j slf4j-api diff --git a/java/memory/memory-core/src/main/java/module-info.java b/java/memory/memory-core/src/main/java/module-info.java index e74044ea41d4a..0a607bdf2f43a 100644 --- a/java/memory/memory-core/src/main/java/module-info.java +++ b/java/memory/memory-core/src/main/java/module-info.java @@ -24,7 +24,6 @@ requires java.compiler; requires transitive jdk.unsupported; - requires jsr305; requires static org.checkerframework.checker.qual; requires static org.immutables.value.annotations; requires static com.google.errorprone.annotations; diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/Accountant.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/Accountant.java index 5a31f4cd1914a..5d052c2cdeeec 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/Accountant.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/Accountant.java @@ -17,7 +17,6 @@ package org.apache.arrow.memory; import java.util.concurrent.atomic.AtomicLong; -import javax.annotation.concurrent.ThreadSafe; import org.apache.arrow.util.Preconditions; import org.checkerframework.checker.nullness.qual.Nullable; @@ -25,7 +24,6 @@ * Provides a concurrent way to manage account for memory usage without locking. Used as basis for * Allocators. All operations are threadsafe (except for close). */ -@ThreadSafe class Accountant implements AutoCloseable { /** The parent allocator. */ diff --git a/java/pom.xml b/java/pom.xml index a6c1002adf382..7ba75af164eb6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -154,11 +154,6 @@ under the License. 
flatbuffers-java ${dep.fbs.version} - - com.google.code.findbugs - jsr305 - 3.0.2 - com.google.errorprone error_prone_annotations From a49dbcc8bda170f48f3cc07f5453105129015db0 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 24 Jul 2024 11:11:43 -0400 Subject: [PATCH 13/73] GH-43394: [Java][Benchmarking] Fix Java benchmarks for Java 17+ (#43395) ### Rationale for this change Now that Arrow Java has moved from Java 8 -> Java 11, we need to add `--add-opens` when running Java. ### What changes are included in this PR? * Add `--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED` to `_JAVA_OPTIONS` in archery. * Clean up test code only used for Java 8 ### Are these changes tested? To be verified via CI and ursabot. ### Are there any user-facing changes? No * GitHub Issue: #43394 Authored-by: Dane Pitkin Signed-off-by: Dane Pitkin --- .../archery/integration/tester_java.py | 29 +++---------------- dev/archery/archery/lang/java.py | 12 +++++++- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 9b14c6939cde8..8d207d3393730 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -46,6 +46,7 @@ def load_version_from_pom(): _JAVA_OPTS = [ "-Dio.netty.tryReflectionSetAccessible=true", "-Darrow.struct.conflict.policy=CONFLICT_APPEND", + "--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED", # GH-39113: avoid failures accessing files in `/tmp/hsperfdata_...` "-XX:-UsePerfData", ] @@ -88,24 +89,13 @@ def setup_jpype(): import jpype jar_path = f"{_ARROW_TOOLS_JAR}:{_ARROW_C_DATA_JAR}" # XXX Didn't manage to tone down the logging level here (DEBUG -> INFO) - java_opts = _JAVA_OPTS[:] - proc = subprocess.run( - ['java', '--add-opens'], - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - text=True) - if 'Unrecognized option: --add-opens' not in proc.stderr: - # Java 9+ - java_opts.append( - '--add-opens=java.base/java.nio=' - 'org.apache.arrow.memory.core,ALL-UNNAMED') jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=" + jar_path, # This flag is too heavy for IPC and Flight tests "-Darrow.memory.debug.allocator=true", # Reduce internal use of signals by the JVM "-Xrs", - *java_opts) + *_JAVA_OPTS) class _CDataBase: @@ -253,20 +243,9 @@ class JavaTester(Tester): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Detect whether we're on Java 8 or Java 9+ self._java_opts = _JAVA_OPTS[:] - proc = subprocess.run( - ['java', '--add-opens'], - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - text=True) - if 'Unrecognized option: --add-opens' not in proc.stderr: - # Java 9+ - self._java_opts.append( - '--add-opens=java.base/java.nio=' - 'org.apache.arrow.memory.core,ALL-UNNAMED') - self._java_opts.append( - '--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED') + self._java_opts.append( + '--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED') def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): cmd = ( diff --git a/dev/archery/archery/lang/java.py b/dev/archery/archery/lang/java.py index bc169adf647bc..0087208d34f98 100644 --- a/dev/archery/archery/lang/java.py +++ b/dev/archery/archery/lang/java.py @@ -34,8 +34,11 @@ def __init__(self, jar, *args, **kwargs): class JavaConfiguration: - def __init__(self, + REQUIRED_JAVA_OPTIONS = [ + "--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED", + ] + def __init__(self, # 
toolchain java_home=None, java_options=None, # build & benchmark @@ -43,6 +46,13 @@ def __init__(self, self.java_home = java_home self.java_options = java_options + if self.java_options is None: + self.java_options = " ".join(self.REQUIRED_JAVA_OPTIONS) + else: + for option in self.REQUIRED_JAVA_OPTIONS: + if option not in self.java_options: + self.java_options += " " + option + self.build_extras = list(build_extras) if build_extras else [] self.benchmark_extras = list( benchmark_extras) if benchmark_extras else [] From 14384ac1865aa4a62511457d1a70f98d4c5d0402 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 24 Jul 2024 11:12:27 -0400 Subject: [PATCH 14/73] MINOR: [Python] Remove extra import from PyCapsule Interface Doc (#43399) ### Rationale for this change The PyCapsule Interface doc includes the line ```py from typing_extensions import Self ``` but `Self` is not used anywhere in the typing declarations, so that line can be removed. ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? No Authored-by: Kyle Barron Signed-off-by: Dane Pitkin --- docs/source/format/CDataInterface/PyCapsuleInterface.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index d38ba2822da46..f4f6b54849e77 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -303,7 +303,6 @@ function accepts an object implementing one of these protocols. .. code-block:: python from typing import Tuple, Protocol - from typing_extensions import Self class ArrowSchemaExportable(Protocol): def __arrow_c_schema__(self) -> object: ... From 674e221f41c602c8f71c7a2c8e53e7c7c11b1ede Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Wed, 24 Jul 2024 11:59:47 -0700 Subject: [PATCH 15/73] MINOR: [Java] Bump org.apache:apache from 31 to 33 in /java (#43405) Bumps [org.apache:apache](https://github.com/apache/maven-apache-parent) from 31 to 33. - [Release notes](https://github.com/apache/maven-apache-parent/releases) - [Commits](https://github.com/apache/maven-apache-parent/commits) Clean up Maven plugin version overrides --- updated-dependencies: - dependency-name: org.apache:apache dependency-type: direct:production update-type: version-update:semver-major ... Authored-by: Laurent Goujon Signed-off-by: Dane Pitkin --- java/bom/pom.xml | 12 +----------- java/pom.xml | 11 +---------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/java/bom/pom.xml b/java/bom/pom.xml index ad6532b1192bb..fe3264102144b 100644 --- a/java/bom/pom.xml +++ b/java/bom/pom.xml @@ -23,7 +23,7 @@ under the License. org.apache apache - 31 + 33 org.apache.arrow @@ -83,16 +83,6 @@ under the License. 11 11 11 - 3.12.0 - 3.2.5 - 0.16.1 - 3.7.1 - 3.12.1 - 3.6.1 - 3.2.4 - 3.2.2 - 3.6.3 - 3.5.0 diff --git a/java/pom.xml b/java/pom.xml index 7ba75af164eb6..be49b6610f3f6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -23,7 +23,7 @@ under the License. org.apache apache - 31 + 33 org.apache.arrow @@ -117,20 +117,11 @@ under the License. 
11 11 11 - 3.12.0 - 3.2.5 - 0.16.1 - 3.7.1 - 3.12.1 - 3.6.1 - 3.2.4 3.2.2 - 3.6.3 - 3.5.0 From e2d4dbfa43ea570c1d3234776f4ea44170f1792f Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 25 Jul 2024 09:14:50 -0400 Subject: [PATCH 16/73] GH-43412: [Java][Benchmarking] Use JDK_JAVA_OPTIONS for JVM arguments (#43411) ### Rationale for this change Java options are not getting picked up properly in archery. ### What changes are included in this PR? * Use JDK_JAVA_OPTIONS instead of _JAVA_OPTIONS env var. ### Are these changes tested? Test via benchmark bot (ursabot) ### Are there any user-facing changes? No * GitHub Issue: #43412 Authored-by: Dane Pitkin Signed-off-by: Dane Pitkin --- dev/archery/archery/lang/java.py | 2 +- docs/source/java/install.rst | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/archery/archery/lang/java.py b/dev/archery/archery/lang/java.py index 0087208d34f98..f447b352e6a6c 100644 --- a/dev/archery/archery/lang/java.py +++ b/dev/archery/archery/lang/java.py @@ -73,7 +73,7 @@ def environment(self): env["JAVA_HOME"] = self.java_home if self.java_options: - env["JAVA_OPTIONS"] = self.java_options + env["JDK_JAVA_OPTIONS"] = self.java_options return env diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index 3e01f72a56878..c238690c6b930 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -40,7 +40,7 @@ adding ``--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED # Directly on the command line $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ... # Indirectly via environment variables - $ env _JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... + $ env JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... Otherwise, you may see errors like ``module java.base does not "opens java.nio" to unnamed module`` or ``module java.base does not "opens @@ -58,7 +58,7 @@ Modifying the command above for Flight: # Directly on the command line $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ... # Indirectly via environment variables - $ env _JAVA_OPTIONS="--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... + $ env JDK_JAVA_OPTIONS="--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) @@ -73,7 +73,7 @@ Modifying the command above for arrow-memory: # Directly on the command line $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ... # Indirectly via environment variables - $ env _JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... + $ env JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... 
Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException: Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module @@ -216,7 +216,7 @@ Or they can be added via environment variable, for example when executing your c .. code-block:: - _JAVA_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED" mvn exec:java -Dexec.mainClass="YourMainCode" + JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED" mvn exec:java -Dexec.mainClass="YourMainCode" Installing from Source ====================== From fc075ad736964897d2ad1320bfb2303b84251f1c Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Thu, 25 Jul 2024 06:40:14 -0700 Subject: [PATCH 17/73] GH-43380: [Java] Add support for cross jdk version testing (#43381) ### Rationale for this change This change allows using a different JDK version for tests than the one used to build the project. ### What changes are included in this PR? Provided a new property `arrow.test.jdk-version` which specifies a JDK version to be used by the surefire/failsafe plugins instead of the version used to execute Maven. As part of the change, also add a Java version condition for `TestOpens` so it is only executed if the Java runtime version is 16 or greater. Also add a Testing section to the Java developer documentation. ### Are these changes tested? via CI/CD ### Are there any user-facing changes? New build property `arrow.test.jdk-version` allows developers to specify the JDK version used for tests * GitHub Issue: #43380 Lead-authored-by: Laurent Goujon Co-authored-by: Laurent Goujon Co-authored-by: David Li Co-authored-by: Dane Pitkin Signed-off-by: Dane Pitkin --- docs/source/developers/java/building.rst | 48 +++++++++++++++ java/flight/flight-core/pom.xml | 2 + java/flight/flight-sql-jdbc-driver/pom.xml | 1 + java/flight/flight-sql/pom.xml | 2 + java/memory/memory-core/pom.xml | 58 +++++++----------- .../org/apache/arrow/memory/TestOpens.java | 5 +- java/memory/memory-netty/pom.xml | 1 + java/pom.xml | 60 ++++++++++++++++++- java/vector/pom.xml | 1 + 9 files changed, 140 insertions(+), 38 deletions(-) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 63a7b4369b809..3904841de9c5a 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -321,6 +321,54 @@ Building Java JNI Modules -Darrow.c.jni.dist.dir=/java-dist/lib/ \ -Parrow-jni clean install +Testing +======= + +By default, Maven uses the same Java version to both build the code and run the tests. + +It is also possible to use a different JDK version for the tests. This requires Maven +toolchains to be configured beforehand, and then a specific test property needs to be set. + +Configuring Maven toolchains +---------------------------- + +To be able to use a JDK version for testing, it needs to be registered first in Maven ``toolchains.xml`` +configuration file usually located under ``${HOME}/.m2`` with the following snippet added to it: + + .. code-block:: + + + + + [...] + + + jdk + + 21 + temurin + + + path/to/jdk/home + + + + [...] + + + +Testing with a specific JDK +--------------------------- + +To run Arrow tests with a specific JDK version, use the ``arrow.test.jdk-version`` property. + +For example, to run Arrow tests with JDK 17, use the following snippet: + + ..
code-block:: + + $ cd arrow/java + $ mvn -Darrow.test.jdk-version=17 clean verify + IDE Configuration ================= diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index d4083383a2f44..be3c191654a58 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -32,6 +32,8 @@ under the License. 1 + + --add-opens=org.apache.arrow.flight.core/org.apache.arrow.flight.perf.impl=protobuf.java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 3dfe3bcd33f50..148319e5d9d64 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -59,6 +59,7 @@ under the License. maven-failsafe-plugin + default-it integration-test verify diff --git a/java/flight/flight-sql/pom.xml b/java/flight/flight-sql/pom.xml index 81bdc1a25636a..c9c589d202ac6 100644 --- a/java/flight/flight-sql/pom.xml +++ b/java/flight/flight-sql/pom.xml @@ -32,6 +32,8 @@ under the License. 1 + + --add-reads=org.apache.arrow.flight.sql=org.slf4j --add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED diff --git a/java/memory/memory-core/pom.xml b/java/memory/memory-core/pom.xml index b9fa8ab1a6942..9b24cee032023 100644 --- a/java/memory/memory-core/pom.xml +++ b/java/memory/memory-core/pom.xml @@ -30,6 +30,11 @@ under the License. Arrow Memory - Core Core off-heap memory management libraries for Arrow ValueVectors. + + + --add-reads=org.apache.arrow.memory.core=ch.qos.logback.classic --add-opens=java.base/java.lang.reflect=org.apache.arrow.memory.core --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED + + org.slf4j @@ -85,42 +90,25 @@ under the License. **/TestOpens.java + + + + opens-tests + + test + + test + + + + + + **/TestOpens.java + + + +
- - - - opens-tests - - - [16,] - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - opens-tests - - test - - test - - - - - - **/TestOpens.java - - - - - - - - - diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestOpens.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestOpens.java index 756aa2919789b..b5e0a71e7ee0e 100644 --- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestOpens.java +++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestOpens.java @@ -18,12 +18,15 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.condition.JRE.JAVA_16; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledForJreRange; public class TestOpens { /** Instantiating the RootAllocator should poke MemoryUtil and fail. */ @Test + @EnabledForJreRange(min = JAVA_16) public void testMemoryUtilFailsLoudly() { // This test is configured by Maven to run WITHOUT add-opens. So this should fail on JDK16+ // (where JEP396 means that add-opens is required to access JDK internals). @@ -44,6 +47,6 @@ public void testMemoryUtilFailsLoudly() { break; } } - assertTrue(found, "Expected exception as not thrown"); + assertTrue(found, "Expected exception was not thrown"); } } diff --git a/java/memory/memory-netty/pom.xml b/java/memory/memory-netty/pom.xml index e29ca3a4d053c..f2d4d2d0fe3bc 100644 --- a/java/memory/memory-netty/pom.xml +++ b/java/memory/memory-netty/pom.xml @@ -78,6 +78,7 @@ under the License. maven-failsafe-plugin + default-it integration-test verify diff --git a/java/pom.xml b/java/pom.xml index be49b6610f3f6..35c2a433033bf 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -112,6 +112,8 @@ under the License. 3.45.0 none -Xdoclint:none + + --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED 11 11 @@ -303,7 +305,7 @@ under the License. maven-surefire-plugin - --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED + ${surefire.add-opens.argLine} true true ${forkCount} @@ -322,7 +324,7 @@ under the License. maven-failsafe-plugin - --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED + ${surefire.add-opens.argLine} ${project.build.directory} true @@ -1265,5 +1267,59 @@ under the License. + + + + cross-jdk-testing + + + arrow.test.jdk-version + + + + + + maven-enforcer-plugin + + + check-jdk-version-property + + enforce + + validate + + + + arrow.test.jdk-version + "JDK version used for test must be specified." + ^\d{2,} + "JDK version used for test must be 11, 17, 21, ..." + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + ${arrow.test.jdk-version} + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + ${arrow.test.jdk-version} + + + + + + diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 7a53bf754aae4..73d76fc7306ae 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -188,6 +188,7 @@ under the License. 
+ default-it integration-test verify From ad4d490f926c9d36ba0d679aa142a94db12fb647 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:01:29 -0400 Subject: [PATCH 18/73] MINOR: [Java] Bump io.netty:netty-bom from 4.1.110.Final to 4.1.112.Final in /java (#43366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.netty:netty-bom](https://github.com/netty/netty) from 4.1.110.Final to 4.1.112.Final.
Commits
  • ebe2aa5 [maven-release-plugin] prepare release netty-4.1.112.Final
  • 5e2d384 Only include scopeId on link-local addresses when using native transport (#14...
  • 5c0b0d5 Validate HTTP version while decoding (#14187)
  • b360abc Allow HTTP responses without reason-phrase (#14183)
  • a6bf424 ScheduledFutureTask: avoid invoke system clock again (#14162)
  • 3881103 Fix checkstyle errors introduced by fixing merge conflicts
  • 3c06dd6 Reject request if NUL is present in the request line (#14180)
  • e6a78dd Don't strip whitespaces from header names and let the validator handl… (#14179)
  • c5d3d72 Reject http header values with non SP / HTAB chars (#14178)
  • d05af24 Fix potential DNS cache invalidation across different EventLoops (#14147)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.netty:netty-bom&package-manager=maven&previous-version=4.1.110.Final&new-version=4.1.112.Final)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
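Not part of the upstream patch, but a quick way to confirm that a BOM bump like this actually took effect at runtime: `io.netty.util.Version.identify()` (in `netty-common`) reports the Netty artifact versions resolved on the classpath. A minimal sketch, assuming `netty-common` is available as a test-scope dependency; the class name is illustrative:

```java
import io.netty.util.Version;

import java.util.Map;

public class NettyVersionCheck {
    public static void main(String[] args) {
        // Maps artifact ids (e.g. "netty-buffer") to their resolved versions;
        // after this bump every entry should report 4.1.112.Final.
        Map<String, Version> versions = Version.identify();
        versions.forEach(
            (artifact, version) ->
                System.out.println(artifact + " -> " + version.artifactVersion()));
    }
}
```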
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 35c2a433033bf..376fe2589f5ee 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -96,7 +96,7 @@ under the License. 5.10.3 2.0.13 33.2.1-jre - 4.1.110.Final + 4.1.112.Final 1.65.0 3.25.1 2.17.2 From bde199a0b4f59952e2ab88a1e83ed2fa1e34b746 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:18:01 -0400 Subject: [PATCH 19/73] MINOR: [Java] Bump org.assertj:assertj-core from 3.26.0 to 3.26.3 in /java (#43263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.assertj:assertj-core](https://github.com/assertj/assertj) from 3.26.0 to 3.26.3.
Release notes (sourced from org.assertj:assertj-core's releases):

v3.26.3

:jigsaw: Binary Compatibility

The release is:

  • Binary compatible with the previous minor version.
  • Binary incompatible with the previous patch version.

:boom: Breaking Changes

Core

  • Replace assertThat(Temporal) with assertThatTemporal(Temporal) #3519

:bug: Bug Fixes

Core

  • Fix Javadoc rendering on FactoryBasedNavigableListAssert::assertThat
  • Allow ComparingNormalizedFields instances to be reused across different assertions #3493

:hammer: Dependency Upgrades

Core

  • Upgrade to Byte Buddy 1.14.18 #3531
  • Upgrade to JUnit BOM 5.10.3 #3525

Guava

  • Upgrade to Guava 33.2.1-jre #3499

:heart: Contributors

Thanks to all the contributors who worked on this release:

  • @genuss
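The `assertThatTemporal` entry above is the one breaking change worth illustrating. A minimal sketch of an affected call site, not taken from the Arrow sources; the class and variable names are illustrative:

```java
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatTemporal;

import java.time.Instant;
import java.time.temporal.Temporal;

class TemporalAssertionSketch {
    void check() {
        // A value statically typed as Temporal must now go through
        // assertThatTemporal; assertThat(Temporal) is the overload
        // replaced in assertj #3519.
        Temporal deadline = Instant.parse("2024-07-25T00:00:00Z");
        assertThatTemporal(deadline).isEqualTo(Instant.parse("2024-07-25T00:00:00Z"));

        // Concrete java.time types keep their dedicated assertThat overloads.
        assertThat(Instant.now()).isAfter(Instant.EPOCH);
    }
}
```

If a test suite only asserts on concrete types, a bump like this reduces to the one-line version change shown in the diff below.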
Commits
  • 8e97f90 [maven-release-plugin] prepare release assertj-build-3.26.3
  • d1afefc chore(deps): bump com.github.spotbugs:spotbugs-maven-plugin from 4.8.6.1 to 4...
  • 2dc2cbf chore(deps): bump byte-buddy.version from 1.14.17 to 1.14.18 (#3531)
  • 2541d3c chore(deps-dev): bump com.fasterxml.jackson.core:jackson-databind from 2.17.1...
  • cdb906f [maven-release-plugin] prepare for next development iteration
  • c3b1f4a [maven-release-plugin] prepare release assertj-build-3.26.2
  • d5b52ab [maven-release-plugin] prepare for next development iteration
  • 17ea711 [maven-release-plugin] prepare release assertj-build-3.26.1
  • 8cf054d chore(deps): bump org.codehaus.mojo:versions-maven-plugin from 2.16.2 to 2.17...
  • 5e708b4 chore(deps-dev): bump org.apache.groovy:groovy from 4.0.21 to 4.0.22 (#3527)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.assertj:assertj-core&package-manager=maven&previous-version=3.26.0&new-version=3.26.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 376fe2589f5ee..997257c71b6e9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -171,7 +171,7 @@ under the License. org.assertj assertj-core - 3.26.0 + 3.26.3 test From 1f67c1a16a426d27a52d9aa31fc1b39602bad161 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 25 Jul 2024 13:22:51 -0300 Subject: [PATCH 20/73] GH-43291: [C++] Expand the 'take' function tests to cover more chunked-array cases (#43292) ### Rationale for this change #41700 (as it is currently) passes all the C++ tests even though it contains a few bugs (caught by manual repro steps and tests of the Ruby bindings). The C++ tests should be able to catch these kinds of bugs and exercise code beyond the TakeAAA cases. ### What changes are included in this PR? - Explicitly calling out which TakeXX variation is being checked in tests and assert helpers - Using `AssertChunkedEqual` instead of `AssertChunkedEquivalent` (via `AssertDatumsEqual`) ### Are these changes tested? Yes. The improved tests catch bugs now. * GitHub Issue: #43291 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- .../compute/kernels/vector_selection_test.cc | 1037 ++++++++++------- 1 file changed, 608 insertions(+), 429 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index aba016d6b7e8d..b38f3fcbd8ccd 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -28,6 +28,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/scalar.h" #include "arrow/table.h" #include "arrow/testing/builder.h" #include "arrow/testing/fixed_width_test_util.h" @@ -1101,33 +1102,114 @@ TEST(TestFilterMetaFunction, ArityChecking) { // ---------------------------------------------------------------------- // Take tests +// +// Shorthand notation (as defined in `TakeMetaFunction`): +// +// A = Array +// C = ChunkedArray +// R = RecordBatch +// T = Table +// +// (e.g.
TakeCAC = Take(ChunkedArray, Array) -> ChunkedArray) +// +// The interface implemented by `TakeMetaFunction` is: +// +// Take(A, A) -> A (TakeAAA) +// Take(A, C) -> C (TakeACC) +// Take(C, A) -> C (TakeCAC) +// Take(C, C) -> C (TakeCCC) +// Take(R, A) -> R (TakeRAR) +// Take(T, A) -> T (TakeTAT) +// Take(T, C) -> T (TakeTCT) +// +// The tests extend the notation with a few "union kinds": +// +// X = Array | ChunkedArray +// +// Examples: +// +// TakeXA = {TakeAAA, TakeCAC}, +// TakeXX = {TakeAAA, TakeACC, TakeCAC, TakeCCC} +namespace { -void AssertTakeArrays(const std::shared_ptr& values, - const std::shared_ptr& indices, - const std::shared_ptr& expected) { - ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, Take(*values, *indices)); - ValidateOutput(actual); - AssertArraysEqual(*expected, *actual, /*verbose=*/true); +Result> TakeAAA(const Array& values, const Array& indices) { + ARROW_ASSIGN_OR_RAISE(Datum out, Take(Datum(values), Datum(indices))); + return out.make_array(); } -Status TakeJSON(const std::shared_ptr& type, const std::string& values, - const std::shared_ptr& index_type, const std::string& indices, - std::shared_ptr* out) { - return Take(*ArrayFromJSON(type, values), *ArrayFromJSON(index_type, indices)) - .Value(out); +Result> TakeAAA( + const std::shared_ptr& type, const std::string& values, + const std::string& indices, const std::shared_ptr& index_type = int32()) { + return TakeAAA(*ArrayFromJSON(type, values), *ArrayFromJSON(index_type, indices)); } -void DoCheckTake(const std::shared_ptr& values, - const std::shared_ptr& indices, - const std::shared_ptr& expected) { - AssertTakeArrays(values, indices, expected); +// TakeACC is never tested directly, so it is not defined here + +Result TakeCAC(std::shared_ptr values, + std::shared_ptr indices) { + return Take(Datum{std::move(values)}, Datum{std::move(indices)}); +} + +Result TakeCAC(const std::shared_ptr& type, + const std::vector& values, const std::string& indices, + const std::shared_ptr& index_type = int8()) { + return TakeCAC(ChunkedArrayFromJSON(type, values), ArrayFromJSON(index_type, indices)); +} + +Result TakeCCC(std::shared_ptr values, + std::shared_ptr indices) { + return Take(Datum{std::move(values)}, Datum{std::move(indices)}); +} + +Result TakeCCC(const std::shared_ptr& type, + const std::vector& values, + const std::vector& indices) { + return TakeCCC(ChunkedArrayFromJSON(type, values), + ChunkedArrayFromJSON(int8(), indices)); +} + +Result TakeRAR(const std::shared_ptr& schm, const std::string& batch_json, + const std::string& indices, + const std::shared_ptr& index_type = int8()) { + auto batch = RecordBatchFromJSON(schm, batch_json); + return Take(Datum{std::move(batch)}, Datum{ArrayFromJSON(index_type, indices)}); +} + +Result TakeTAT(const std::shared_ptr& schm, + const std::vector& values, const std::string& indices, + const std::shared_ptr& index_type = int8()) { + return Take(Datum{TableFromJSON(schm, values)}, + Datum{ArrayFromJSON(index_type, indices)}); +} + +Result TakeTCT(const std::shared_ptr& schm, + const std::vector& values, + const std::vector& indices) { + return Take(Datum{TableFromJSON(schm, values)}, + Datum{ChunkedArrayFromJSON(int8(), indices)}); +} + +// Assert helpers for Take tests + +void DoAssertTakeAAA(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, TakeAAA(*values, *indices)); + ValidateOutput(actual); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); +} + +void 
DoCheckTakeAAA(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + DoAssertTakeAAA(values, indices, expected); // Check sliced values ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(values->type(), 2)); ASSERT_OK_AND_ASSIGN(auto values_sliced, Concatenate({values_filler, values, values_filler})); values_sliced = values_sliced->Slice(2, values->length()); - AssertTakeArrays(values_sliced, indices, expected); + DoAssertTakeAAA(values_sliced, indices, expected); // Check sliced indices ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(indices->type(), int8_t{0})); @@ -1135,33 +1217,171 @@ void DoCheckTake(const std::shared_ptr& values, ASSERT_OK_AND_ASSIGN(auto indices_sliced, Concatenate({indices_filler, indices, indices_filler})); indices_sliced = indices_sliced->Slice(3, indices->length()); - AssertTakeArrays(values, indices_sliced, expected); -} - -void CheckTake(const std::shared_ptr& type, const std::string& values_json, - const std::string& indices_json, const std::string& expected_json) { + DoAssertTakeAAA(values, indices_sliced, expected); +} + +void DoCheckTakeCACWithArrays(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + auto pool = default_memory_pool(); + const bool indices_null_count_is_known = indices->null_count() != kUnknownNullCount; + + // We check TakeCAC by checking this equality: + // + // TakeAAA(Concat(V, V, V), I') == Concat(TakeCAC([V, V, V], I')) + // where + // V = values + // I = indices + // I' = Concat(I + 2 * V.length, I, I + V.length) + auto values3 = ArrayVector{values, values, values}; + ASSERT_OK_AND_ASSIGN(auto concat_values3, Concatenate(values3, pool)); + auto chunked_values3 = std::make_shared(values3); + std::shared_ptr concat_indices3; + { + auto double_length = + MakeScalar(indices->type(), static_cast(2 * values->length())); + auto zero = MakeScalar(indices->type(), 0); + auto length = MakeScalar(indices->type(), static_cast(values->length())); + ASSERT_OK_AND_ASSIGN(auto indices_prefix, Add(indices, *double_length)); + ASSERT_OK_AND_ASSIGN(auto indices_middle, Add(indices, *zero)); + ASSERT_OK_AND_ASSIGN(auto indices_suffix, Add(indices, *length)); + auto indices3 = ArrayVector{ + indices_prefix.make_array(), + indices_middle.make_array(), + indices_suffix.make_array(), + }; + ASSERT_OK_AND_ASSIGN(concat_indices3, Concatenate(indices3, pool)); + // Preserve the fact that indices->null_count() is unknown if it is unknown. 
+ if (!indices_null_count_is_known) { + concat_indices3->data()->null_count = kUnknownNullCount; + } + } + ASSERT_OK_AND_ASSIGN(auto concat_expected3, + Concatenate({expected, expected, expected})); + ASSERT_OK_AND_ASSIGN(Datum chunked_actual, TakeCAC(chunked_values3, concat_indices3)); + ValidateOutput(chunked_actual); + ASSERT_OK_AND_ASSIGN(auto concat_actual, + Concatenate(chunked_actual.chunked_array()->chunks())); + AssertArraysEqual(*concat_expected3, *concat_actual, /*verbose=*/true); + + // We check TakeCAC again by checking this equality: + // + // TakeAAA(V, I) == Concat(TakeCAC(C, I)) + // where + // K = V.length // 4 + // C = [V.slice(0, K), V.slice(K, 2*K), V.slice(3*K, N - 3*K)] + // V = values + // I = indices + const int64_t n = values->length(); + const int64_t k = n / 4; + if (k > 0) { + auto value_slices = ArrayVector{values->Slice(0, k), values->Slice(k, 2 * k), + values->Slice(3 * k, n - k)}; + auto chunked_values = std::make_shared(value_slices); + ASSERT_OK_AND_ASSIGN(chunked_actual, TakeCAC(chunked_values, indices)); + ValidateOutput(chunked_actual); + ASSERT_OK_AND_ASSIGN(concat_actual, + Concatenate(chunked_actual.chunked_array()->chunks())); + AssertArraysEqual(*concat_actual, *expected, /*verbose=*/true); + } +} + +// TakeXA = {TakeAAA, TakeCAC} +void DoCheckTakeXA(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + DoCheckTakeAAA(values, indices, expected); + DoCheckTakeCACWithArrays(values, indices, expected); +} + +// TakeXA = {TakeAAA, TakeCAC} +void CheckTakeXA(const std::shared_ptr& type, const std::string& values_json, + const std::string& indices_json, const std::string& expected_json) { auto values = ArrayFromJSON(type, values_json); auto expected = ArrayFromJSON(type, expected_json); for (auto index_type : {int8(), uint32()}) { auto indices = ArrayFromJSON(index_type, indices_json); - DoCheckTake(values, indices, expected); + DoCheckTakeXA(values, indices, expected); } } -void AssertTakeNull(const std::string& values, const std::string& indices, - const std::string& expected) { - CheckTake(null(), values, indices, expected); +void CheckTakeXADictionary(std::shared_ptr value_type, + const std::string& dictionary_values, + const std::string& dictionary_indices, + const std::string& indices, + const std::string& expected_indices) { + auto dict = ArrayFromJSON(value_type, dictionary_values); + auto type = dictionary(int8(), value_type); + ASSERT_OK_AND_ASSIGN( + auto values, + DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_indices), dict)); + ASSERT_OK_AND_ASSIGN( + auto expected, + DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_indices), dict)); + auto take_indices = ArrayFromJSON(int8(), indices); + DoCheckTakeXA(values, take_indices, expected); } -void AssertTakeBoolean(const std::string& values, const std::string& indices, - const std::string& expected) { - CheckTake(boolean(), values, indices, expected); +void AssertTakeCAC(const std::shared_ptr& type, + const std::vector& values, const std::string& indices, + const std::vector& expected) { + ASSERT_OK_AND_ASSIGN(auto actual, TakeCAC(type, values, indices)); + ValidateOutput(actual); + AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual.chunked_array()); } +void AssertTakeCCC(const std::shared_ptr& type, + const std::vector& values, + const std::vector& indices, + const std::vector& expected) { + ASSERT_OK_AND_ASSIGN(auto actual, TakeCCC(type, values, indices)); + ValidateOutput(actual); + 
AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual.chunked_array()); +} + +void CheckTakeXCC(const Datum& values, const std::vector& indices, + const std::vector& expected) { + EXPECT_TRUE(values.is_array() || values.is_chunked_array()); + auto idx = ChunkedArrayFromJSON(int32(), indices); + ASSERT_OK_AND_ASSIGN(auto actual, Take(values, Datum{idx})); + ValidateOutput(actual); + AssertChunkedEqual(*ChunkedArrayFromJSON(values.type(), expected), + *actual.chunked_array()); +} + +void AssertTakeRAR(const std::shared_ptr& schm, const std::string& batch_json, + const std::string& indices, const std::string& expected_batch) { + for (auto index_type : {int8(), uint32()}) { + ASSERT_OK_AND_ASSIGN(auto actual, TakeRAR(schm, batch_json, indices, index_type)); + ValidateOutput(actual); + ASSERT_BATCHES_EQUAL(*RecordBatchFromJSON(schm, expected_batch), + *actual.record_batch()); + } +} + +void AssertTakeTAT(const std::shared_ptr& schm, + const std::vector& table_json, const std::string& filter, + const std::vector& expected_table) { + ASSERT_OK_AND_ASSIGN(auto actual, TakeTAT(schm, table_json, filter)); + ValidateOutput(actual); + ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual.table()); +} + +void AssertTakeTCT(const std::shared_ptr& schm, + const std::vector& table_json, + const std::vector& filter, + const std::vector& expected_table) { + ASSERT_OK_AND_ASSIGN(auto actual, TakeTCT(schm, table_json, filter)); + ValidateOutput(actual); + ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual.table()); +} + +// Validators used by random data tests + template -void ValidateTakeImpl(const std::shared_ptr& values, - const std::shared_ptr& indices, - const std::shared_ptr& result) { +void ValidateTakeXAImpl(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& result) { using ValuesArrayType = typename TypeTraits::ArrayType; using IndexArrayType = typename TypeTraits::ArrayType; auto typed_values = checked_pointer_cast(values); @@ -1185,39 +1405,45 @@ void ValidateTakeImpl(const std::shared_ptr& values, << i; } } + // DoCheckTakeCACWithArrays transforms the indices which has a risk of + // overflow, so we only call it if the index type is not too wide. 
+ if (indices->type()->byte_width() <= 4) { + auto cast_options = CastOptions::Safe(TypeHolder{int64()}); + ASSERT_OK_AND_ASSIGN(auto indices64, Cast(indices, cast_options)); + DoCheckTakeCACWithArrays(values, indices64.make_array(), /*expected=*/result); + } } template -void ValidateTake(const std::shared_ptr& values, - const std::shared_ptr& indices) { - ASSERT_OK_AND_ASSIGN(Datum out, Take(values, indices)); - auto taken = out.make_array(); +void ValidateTakeXA(const std::shared_ptr& values, + const std::shared_ptr& indices) { + ASSERT_OK_AND_ASSIGN(auto taken, TakeAAA(*values, *indices)); ValidateOutput(taken); ASSERT_EQ(indices->length(), taken->length()); switch (indices->type_id()) { case Type::INT8: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::INT16: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::INT32: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::INT64: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::UINT8: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::UINT16: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::UINT32: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; case Type::UINT64: - ValidateTakeImpl(values, indices, taken); + ValidateTakeXAImpl(values, indices, taken); break; default: FAIL() << "Invalid index type"; @@ -1225,6 +1451,8 @@ void ValidateTake(const std::shared_ptr& values, } } +// ---- + template T GetMaxIndex(int64_t values_length) { int64_t max_index = values_length - 1; @@ -1239,13 +1467,15 @@ uint64_t GetMaxIndex(int64_t values_length) { return static_cast(values_length - 1); } +} // namespace + class TestTakeKernel : public ::testing::Test { - public: - void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr& values, - const std::shared_ptr& indices) { + private: + void DoTestNoValidityBitmapButUnknownNullCount(const std::shared_ptr& values, + const std::shared_ptr& indices) { ASSERT_EQ(values->null_count(), 0); ASSERT_EQ(indices->null_count(), 0); - auto expected = (*Take(values, indices)).make_array(); + ASSERT_OK_AND_ASSIGN(auto expected, TakeAAA(*values, *indices)); auto new_values = MakeArray(values->data()->Copy()); new_values->data()->buffers[0].reset(); @@ -1253,67 +1483,95 @@ class TestTakeKernel : public ::testing::Test { auto new_indices = MakeArray(indices->data()->Copy()); new_indices->data()->buffers[0].reset(); new_indices->data()->null_count = kUnknownNullCount; - auto result = (*Take(new_values, new_indices)).make_array(); - - AssertArraysEqual(*expected, *result); + DoCheckTakeXA(new_values, new_indices, expected); } - void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr& type, - const std::string& values, - const std::string& indices) { - TestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values), - ArrayFromJSON(int16(), indices)); + public: + void DoTestNoValidityBitmapButUnknownNullCount( + const std::shared_ptr& type, const std::string& values, + const std::string& indices, std::shared_ptr index_type = int8()) { + DoTestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values), + ArrayFromJSON(index_type, indices)); } void TestNumericBasics(const std::shared_ptr& type) { 
ARROW_SCOPED_TRACE("type = ", *type); - CheckTake(type, "[7, 8, 9]", "[]", "[]"); - CheckTake(type, "[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]"); - CheckTake(type, "[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]"); - CheckTake(type, "[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]"); - CheckTake(type, "[null, 8, 9]", "[]", "[]"); - CheckTake(type, "[7, 8, 9]", "[0, 0, 0, 0, 0, 0, 2]", "[7, 7, 7, 7, 7, 7, 9]"); - + CheckTakeXA(type, "[7, 8, 9]", "[]", "[]"); + CheckTakeXA(type, "[7, 8, 9]", "[0, 1, 0]", "[7, 8, 7]"); + CheckTakeXA(type, "[null, 8, 9]", "[0, 1, 0]", "[null, 8, null]"); + CheckTakeXA(type, "[7, 8, 9]", "[null, 1, 0]", "[null, 8, 7]"); + CheckTakeXA(type, "[null, 8, 9]", "[]", "[]"); + CheckTakeXA(type, "[7, 8, 9]", "[0, 0, 0, 0, 0, 0, 2]", "[7, 7, 7, 7, 7, 7, 9]"); + + const std::string k789 = "[7, 8, 9]"; std::shared_ptr arr; - ASSERT_RAISES(IndexError, TakeJSON(type, "[7, 8, 9]", int8(), "[0, 9, 0]", &arr)); - ASSERT_RAISES(IndexError, TakeJSON(type, "[7, 8, 9]", int8(), "[0, -1, 0]", &arr)); + ASSERT_RAISES(IndexError, TakeAAA(type, k789, "[0, 9, 0]").Value(&arr)); + ASSERT_RAISES(IndexError, TakeAAA(type, k789, "[0, -1, 0]").Value(&arr)); + Datum chunked_arr; + ASSERT_RAISES(IndexError, + TakeCAC(type, {k789, k789}, "[0, 9, 0]").Value(&chunked_arr)); + ASSERT_RAISES(IndexError, + TakeCAC(type, {k789, k789}, "[0, -1, 0]").Value(&chunked_arr)); } }; template -class TestTakeKernelTyped : public TestTakeKernel {}; +class TestTakeKernelTyped : public TestTakeKernel { + protected: + virtual std::shared_ptr value_type() const { + if constexpr (is_parameter_free_type::value) { + return TypeTraits::type_singleton(); + } else { + EXPECT_TRUE(false) << "value_type() must be overridden for parameterized types"; + return nullptr; + } + } + + void TestNoValidityBitmapButUnknownNullCount( + const std::string& values, const std::string& indices, + const std::shared_ptr& index_type = int8()) { + return DoTestNoValidityBitmapButUnknownNullCount(this->value_type(), values, indices, + index_type); + } + + void CheckTakeXA(const std::string& values, const std::string& indices, + const std::string& expected) { + compute::CheckTakeXA(this->value_type(), values, indices, expected); + } +}; + +static const char kNull3[] = "[null, null, null]"; TEST_F(TestTakeKernel, TakeNull) { - AssertTakeNull("[null, null, null]", "[0, 1, 0]", "[null, null, null]"); - AssertTakeNull("[null, null, null]", "[0, 2]", "[null, null]"); + CheckTakeXA(null(), kNull3, "[0, 1, 0]", "[null, null, null]"); + CheckTakeXA(null(), kNull3, "[0, 2]", "[null, null]"); std::shared_ptr arr; + ASSERT_RAISES(IndexError, TakeAAA(null(), kNull3, "[0, 9, 0]").Value(&arr)); + ASSERT_RAISES(IndexError, TakeAAA(boolean(), kNull3, "[0, -1, 0]").Value(&arr)); + Datum chunked_arr; ASSERT_RAISES(IndexError, - TakeJSON(null(), "[null, null, null]", int8(), "[0, 9, 0]", &arr)); + TakeCAC(null(), {kNull3, kNull3}, "[0, 9, 0]").Value(&chunked_arr)); ASSERT_RAISES(IndexError, - TakeJSON(boolean(), "[null, null, null]", int8(), "[0, -1, 0]", &arr)); + TakeCAC(boolean(), {kNull3, kNull3}, "[0, -1, 0]").Value(&chunked_arr)); } TEST_F(TestTakeKernel, InvalidIndexType) { std::shared_ptr arr; - ASSERT_RAISES(NotImplemented, TakeJSON(null(), "[null, null, null]", float32(), - "[0.0, 1.0, 0.1]", &arr)); + ASSERT_RAISES(NotImplemented, + TakeAAA(null(), kNull3, "[0.0, 1.0, 0.1]", float32()).Value(&arr)); + Datum chunked_arr; + ASSERT_RAISES(NotImplemented, + TakeCAC(null(), {kNull3, kNull3}, "[0.0, 1.0, 0.1]", float32()) + .Value(&chunked_arr)); } 
-TEST_F(TestTakeKernel, TakeCCEmptyIndices) { - Datum dat = ChunkedArrayFromJSON(int8(), {"[]"}); - Datum idx = ChunkedArrayFromJSON(int32(), {}); - ASSERT_OK_AND_ASSIGN(auto out, Take(dat, idx)); - ValidateOutput(out); - AssertDatumsEqual(ChunkedArrayFromJSON(int8(), {"[]"}), out, true); -} - -TEST_F(TestTakeKernel, TakeACEmptyIndices) { - Datum dat = ArrayFromJSON(int8(), {"[]"}); - Datum idx = ChunkedArrayFromJSON(int32(), {}); - ASSERT_OK_AND_ASSIGN(auto out, Take(dat, idx)); - ValidateOutput(out); - AssertDatumsEqual(ChunkedArrayFromJSON(int8(), {"[]"}), out, true); +TEST_F(TestTakeKernel, TakeXCCEmptyIndices) { + auto expected = std::vector{"[]"}; + auto values = ArrayFromJSON(int8(), {"[1, 3, 3, 7]"}); + CheckTakeXCC(values, {"[]"}, expected); + auto chunked_values = std::make_shared(values); + CheckTakeXCC(chunked_values, {"[]"}, expected); } TEST_F(TestTakeKernel, DefaultOptions) { @@ -1329,18 +1587,25 @@ TEST_F(TestTakeKernel, DefaultOptions) { } TEST_F(TestTakeKernel, TakeBoolean) { - AssertTakeBoolean("[7, 8, 9]", "[]", "[]"); - AssertTakeBoolean("[true, false, true]", "[0, 1, 0]", "[true, false, true]"); - AssertTakeBoolean("[null, false, true]", "[0, 1, 0]", "[null, false, null]"); - AssertTakeBoolean("[true, false, true]", "[null, 1, 0]", "[null, false, true]"); + CheckTakeXA(boolean(), "[7, 8, 9]", "[]", "[]"); + CheckTakeXA(boolean(), "[true, false, true]", "[0, 1, 0]", "[true, false, true]"); + CheckTakeXA(boolean(), "[null, false, true]", "[0, 1, 0]", "[null, false, null]"); + CheckTakeXA(boolean(), "[true, false, true]", "[null, 1, 0]", "[null, false, true]"); - TestNoValidityBitmapButUnknownNullCount(boolean(), "[true, false, true]", "[1, 0, 0]"); + DoTestNoValidityBitmapButUnknownNullCount(boolean(), "[true, false, true]", + "[1, 0, 0]"); + const std::string kTrueFalseTrue = "[true, false, true]"; std::shared_ptr arr; + ASSERT_RAISES(IndexError, TakeAAA(boolean(), kTrueFalseTrue, "[0, 9, 0]").Value(&arr)); + ASSERT_RAISES(IndexError, TakeAAA(boolean(), kTrueFalseTrue, "[0, -1, 0]").Value(&arr)); + Datum chunked_arr; ASSERT_RAISES(IndexError, - TakeJSON(boolean(), "[true, false, true]", int8(), "[0, 9, 0]", &arr)); + TakeCAC(boolean(), {kTrueFalseTrue, kTrueFalseTrue}, "[0, 9, 0]") + .Value(&chunked_arr)); ASSERT_RAISES(IndexError, - TakeJSON(boolean(), "[true, false, true]", int8(), "[0, -1, 0]", &arr)); + TakeCAC(boolean(), {kTrueFalseTrue, kTrueFalseTrue}, "[0, -1, 0]") + .Value(&chunked_arr)); } TEST_F(TestTakeKernel, Temporal) { @@ -1349,8 +1614,8 @@ TEST_F(TestTakeKernel, Temporal) { this->TestNumericBasics(timestamp(TimeUnit::NANO, "Europe/Paris")); this->TestNumericBasics(duration(TimeUnit::SECOND)); this->TestNumericBasics(date32()); - CheckTake(date64(), "[0, 86400000, null]", "[null, 1, 1, 0]", - "[null, 86400000, 86400000, 0]"); + CheckTakeXA(date64(), "[0, 86400000, null]", "[null, 1, 1, 0]", + "[null, 86400000, 86400000, 0]"); } TEST_F(TestTakeKernel, Duration) { @@ -1363,177 +1628,184 @@ TEST_F(TestTakeKernel, Interval) { this->TestNumericBasics(month_interval()); auto type = day_time_interval(); - CheckTake(type, "[[1, -600], [2, 3000], null]", "[0, null, 2, 1]", - "[[1, -600], null, null, [2, 3000]]"); + CheckTakeXA(type, "[[1, -600], [2, 3000], null]", "[0, null, 2, 1]", + "[[1, -600], null, null, [2, 3000]]"); type = month_day_nano_interval(); - CheckTake(type, "[[1, -2, 34567890123456789], [2, 3, -34567890123456789], null]", - "[0, null, 2, 1]", - "[[1, -2, 34567890123456789], null, null, [2, 3, -34567890123456789]]"); + CheckTakeXA(type, "[[1, -2, 
34567890123456789], [2, 3, -34567890123456789], null]", + "[0, null, 2, 1]", + "[[1, -2, 34567890123456789], null, null, [2, 3, -34567890123456789]]"); } template -class TestTakeKernelWithNumeric : public TestTakeKernelTyped { - protected: - void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - CheckTake(type_singleton(), values, indices, expected); - } - - std::shared_ptr type_singleton() { - return TypeTraits::type_singleton(); - } -}; +class TestTakeKernelWithNumeric : public TestTakeKernelTyped {}; TYPED_TEST_SUITE(TestTakeKernelWithNumeric, NumericArrowTypes); TYPED_TEST(TestTakeKernelWithNumeric, TakeNumeric) { - this->TestNumericBasics(this->type_singleton()); + this->TestNumericBasics(this->value_type()); } template class TestTakeKernelWithString : public TestTakeKernelTyped { public: - std::shared_ptr value_type() { - return TypeTraits::type_singleton(); - } - - void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - CheckTake(value_type(), values, indices, expected); - } - - void AssertTakeDictionary(const std::string& dictionary_values, - const std::string& dictionary_indices, - const std::string& indices, - const std::string& expected_indices) { - auto dict = ArrayFromJSON(value_type(), dictionary_values); - auto type = dictionary(int8(), value_type()); - ASSERT_OK_AND_ASSIGN(auto values, - DictionaryArray::FromArrays( - type, ArrayFromJSON(int8(), dictionary_indices), dict)); - ASSERT_OK_AND_ASSIGN( - auto expected, - DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_indices), dict)); - auto take_indices = ArrayFromJSON(int8(), indices); - AssertTakeArrays(values, take_indices, expected); + void AssertTakeXADictionary(const std::string& dictionary_values, + const std::string& dictionary_indices, + const std::string& indices, + const std::string& expected_indices) { + return CheckTakeXADictionary(this->value_type(), dictionary_values, + dictionary_indices, indices, expected_indices); } }; TYPED_TEST_SUITE(TestTakeKernelWithString, BaseBinaryArrowTypes); TYPED_TEST(TestTakeKernelWithString, TakeString) { - this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])"); - this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]"); - this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])"); + this->CheckTakeXA(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])"); + this->CheckTakeXA(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]"); + this->CheckTakeXA(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])"); - this->TestNoValidityBitmapButUnknownNullCount(this->value_type(), R"(["a", "b", "c"])", - "[0, 1, 0]"); + this->TestNoValidityBitmapButUnknownNullCount(R"(["a", "b", "c"])", "[0, 1, 0]"); std::shared_ptr type = this->value_type(); + const std::string kABC = R"(["a", "b", "c"])"; std::shared_ptr arr; - ASSERT_RAISES(IndexError, - TakeJSON(type, R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr)); - ASSERT_RAISES(IndexError, TakeJSON(type, R"(["a", "b", null, "ddd", "ee"])", int64(), - "[2, 5]", &arr)); + ASSERT_RAISES(IndexError, TakeAAA(type, kABC, "[0, 9, 0]").Value(&arr)); + ASSERT_RAISES(IndexError, TakeAAA(type, kABC, "[2, 5]").Value(&arr)); + Datum chunked_arr; + ASSERT_RAISES(IndexError, TakeCAC(type, {kABC, kABC}, "[0, 9, 0]").Value(&chunked_arr)); + ASSERT_RAISES(IndexError, TakeCAC(type, {kABC, kABC}, "[4, 10]").Value(&chunked_arr)); } TYPED_TEST(TestTakeKernelWithString, TakeDictionary) 
{ auto dict = R"(["a", "b", "c", "d", "e"])"; - this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]"); - this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]"); - this->AssertTakeDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4, 3]"); + this->AssertTakeXADictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]"); + this->AssertTakeXADictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]"); + this->AssertTakeXADictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4, 3]"); } class TestTakeKernelFSB : public TestTakeKernelTyped { public: - std::shared_ptr value_type() { return fixed_size_binary(3); } - - void AssertTake(const std::string& values, const std::string& indices, - const std::string& expected) { - CheckTake(value_type(), values, indices, expected); - } + std::shared_ptr value_type() const override { return fixed_size_binary(3); } }; TEST_F(TestTakeKernelFSB, TakeFixedSizeBinary) { - this->AssertTake(R"(["aaa", "bbb", "ccc"])", "[0, 1, 0]", R"(["aaa", "bbb", "aaa"])"); - this->AssertTake(R"([null, "bbb", "ccc"])", "[0, 1, 0]", "[null, \"bbb\", null]"); - this->AssertTake(R"(["aaa", "bbb", "ccc"])", "[null, 1, 0]", R"([null, "bbb", "aaa"])"); + const std::string kABC = R"(["aaa", "bbb", "ccc"])"; + this->CheckTakeXA(kABC, "[0, 1, 0]", R"(["aaa", "bbb", "aaa"])"); + this->CheckTakeXA(R"([null, "bbb", "ccc"])", "[0, 1, 0]", "[null, \"bbb\", null]"); + this->CheckTakeXA(kABC, "[null, 1, 0]", R"([null, "bbb", "aaa"])"); - this->TestNoValidityBitmapButUnknownNullCount(this->value_type(), - R"(["aaa", "bbb", "ccc"])", "[0, 1, 0]"); + this->TestNoValidityBitmapButUnknownNullCount(kABC, "[0, 1, 0]"); std::shared_ptr type = this->value_type(); + const std::string kABNullDE = R"(["aaa", "bbb", null, "ddd", "eee"])"; std::shared_ptr arr; + ASSERT_RAISES(IndexError, TakeAAA(type, kABC, "[0, 9, 0]").Value(&arr)); + ASSERT_RAISES(IndexError, TakeAAA(type, kABNullDE, "[2, 5]").Value(&arr)); + Datum chunked_arr; + ASSERT_RAISES(IndexError, TakeCAC(type, {kABC, kABC}, "[0, 9, 0]").Value(&chunked_arr)); ASSERT_RAISES(IndexError, - TakeJSON(type, R"(["aaa", "bbb", "ccc"])", int8(), "[0, 9, 0]", &arr)); - ASSERT_RAISES(IndexError, TakeJSON(type, R"(["aaa", "bbb", null, "ddd", "eee"])", - int64(), "[2, 5]", &arr)); + TakeCAC(type, {kABNullDE, kABC}, "[4, 10]").Value(&chunked_arr)); } -class TestTakeKernelWithList : public TestTakeKernelTyped {}; +using ListAndListViewArrowTypes = + ::testing::Types; + +template +class TestTakeKernelWithList : public TestTakeKernelTyped { + protected: + std::shared_ptr inner_type_ = nullptr; + + std::shared_ptr value_type(std::shared_ptr inner_type) const { + return std::make_shared(std::move(inner_type)); + } + + std::shared_ptr value_type() const override { + EXPECT_TRUE(inner_type_); + return value_type(inner_type_); + } + + std::vector> InnerListTypes() const { + return std::vector>{ + list(int32()), + large_list(int32()), + list_view(int32()), + large_list_view(int32()), + }; + } +}; + +TYPED_TEST_SUITE(TestTakeKernelWithList, ListAndListViewArrowTypes); -TEST_F(TestTakeKernelWithList, TakeListInt32) { +TYPED_TEST(TestTakeKernelWithList, TakeListInt32) { + this->inner_type_ = int32(); std::string list_json = "[[], [1,2], null, [3]]"; - for (auto& type : kListAndListViewTypes) { - CheckTake(type, list_json, "[]", "[]"); - CheckTake(type, list_json, "[3, 2, 1]", "[[3], null, [1,2]]"); - CheckTake(type, list_json, "[null, 3, 0]", "[null, [3], []]"); - CheckTake(type, list_json, "[null, null]", "[null, null]"); - 
CheckTake(type, list_json, "[3, 0, 0, 3]", "[[3], [], [], [3]]"); - CheckTake(type, list_json, "[0, 1, 2, 3]", list_json); - CheckTake(type, list_json, "[0, 0, 0, 0, 0, 0, 1]", - "[[], [], [], [], [], [], [1, 2]]"); + { + this->CheckTakeXA(list_json, "[]", "[]"); + this->CheckTakeXA(list_json, "[3, 2, 1]", "[[3], null, [1,2]]"); + this->CheckTakeXA(list_json, "[null, 3, 0]", "[null, [3], []]"); + this->CheckTakeXA(list_json, "[null, null]", "[null, null]"); + this->CheckTakeXA(list_json, "[3, 0, 0, 3]", "[[3], [], [], [3]]"); + this->CheckTakeXA(list_json, "[0, 1, 2, 3]", list_json); + this->CheckTakeXA(list_json, "[0, 0, 0, 0, 0, 0, 1]", + "[[], [], [], [], [], [], [1, 2]]"); - this->TestNoValidityBitmapButUnknownNullCount(type, "[[], [1,2], [3]]", "[0, 1, 0]"); + this->TestNoValidityBitmapButUnknownNullCount("[[], [1,2], [3]]", "[0, 1, 0]"); } } -TEST_F(TestTakeKernelWithList, TakeListListInt32) { +TYPED_TEST(TestTakeKernelWithList, TakeListListInt32) { std::string list_json = R"([ [], [[1], [2, null, 2], []], null, [[3, null], null] ])"; - for (auto& type : kNestedListAndListViewTypes) { - ARROW_SCOPED_TRACE("type = ", *type); - CheckTake(type, list_json, "[]", "[]"); - CheckTake(type, list_json, "[3, 2, 1]", R"([ + for (auto& inner_type : this->InnerListTypes()) { + this->inner_type_ = inner_type; + ARROW_SCOPED_TRACE("type = ", *this->value_type()); + this->CheckTakeXA(list_json, "[]", "[]"); + this->CheckTakeXA(list_json, "[3, 2, 1]", R"([ [[3, null], null], null, [[1], [2, null, 2], []] ])"); - CheckTake(type, list_json, "[null, 3, 0]", R"([ + this->CheckTakeXA(list_json, "[null, 3, 0]", R"([ null, [[3, null], null], [] ])"); - CheckTake(type, list_json, "[null, null]", "[null, null]"); - CheckTake(type, list_json, "[3, 0, 0, 3]", - "[[[3, null], null], [], [], [[3, null], null]]"); - CheckTake(type, list_json, "[0, 1, 2, 3]", list_json); - CheckTake(type, list_json, "[0, 0, 0, 0, 0, 0, 1]", - "[[], [], [], [], [], [], [[1], [2, null, 2], []]]"); + this->CheckTakeXA(list_json, "[null, null]", "[null, null]"); + this->CheckTakeXA(list_json, "[3, 0, 0, 3]", + "[[[3, null], null], [], [], [[3, null], null]]"); + this->CheckTakeXA(list_json, "[0, 1, 2, 3]", list_json); + this->CheckTakeXA(list_json, "[0, 0, 0, 0, 0, 0, 1]", + "[[], [], [], [], [], [], [[1], [2, null, 2], []]]"); this->TestNoValidityBitmapButUnknownNullCount( - type, "[[[1], [2, null, 2], []], [[3, null]]]", "[0, 1, 0]"); + "[[[1], [2, null, 2], []], [[3, null]]]", "[0, 1, 0]"); } } -class TestTakeKernelWithLargeList : public TestTakeKernelTyped {}; - -TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) { +TYPED_TEST(TestTakeKernelWithList, TakeLargeListInt32) { + this->inner_type_ = int32(); std::string list_json = "[[], [1,2], null, [3]]"; - for (auto& type : kLargeListAndListViewTypes) { - ARROW_SCOPED_TRACE("type = ", *type); - CheckTake(type, list_json, "[]", "[]"); - CheckTake(type, list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); + { + ARROW_SCOPED_TRACE("type = ", *this->value_type()); + this->CheckTakeXA(list_json, "[]", "[]"); + this->CheckTakeXA(list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); } } class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped { protected: - void CheckTakeOnNestedLists(const std::shared_ptr& inner_type, - const std::vector& list_sizes, int64_t length) { + std::shared_ptr inner_type_ = nullptr; + + std::shared_ptr value_type() const override { + EXPECT_TRUE(inner_type_); + return fixed_size_list(inner_type_, 3); + } + + void 
CheckTakeXAOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes, int64_t length) { using NLG = ::arrow::util::internal::NestedListGenerator; // Create two equivalent lists: one as a FixedSizeList and another as a List. ASSERT_OK_AND_ASSIGN(auto fsl_list, @@ -1544,51 +1816,50 @@ class TestTakeKernelWithFixedSizeList : public TestTakeKernelTypedtype())); - DoCheckTake(fsl_list, indices, expected_fsl); + DoCheckTakeXA(fsl_list, indices, expected_fsl); } }; TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { + inner_type_ = int32(); std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; - CheckTake(fixed_size_list(int32(), 3), list_json, "[]", "[]"); - CheckTake(fixed_size_list(int32(), 3), list_json, "[3, 2, 1]", - "[[7, 8, null], [4, 5, 6], [1, null, 3]]"); - CheckTake(fixed_size_list(int32(), 3), list_json, "[null, 2, 0]", - "[null, [4, 5, 6], null]"); - CheckTake(fixed_size_list(int32(), 3), list_json, "[null, null]", "[null, null]"); - CheckTake(fixed_size_list(int32(), 3), list_json, "[3, 0, 0, 3]", - "[[7, 8, null], null, null, [7, 8, null]]"); - CheckTake(fixed_size_list(int32(), 3), list_json, "[0, 1, 2, 3]", list_json); + CheckTakeXA(list_json, "[]", "[]"); + CheckTakeXA(list_json, "[3, 2, 1]", "[[7, 8, null], [4, 5, 6], [1, null, 3]]"); + CheckTakeXA(list_json, "[null, 2, 0]", "[null, [4, 5, 6], null]"); + CheckTakeXA(list_json, "[null, null]", "[null, null]"); + CheckTakeXA(list_json, "[3, 0, 0, 3]", "[[7, 8, null], null, null, [7, 8, null]]"); + CheckTakeXA(list_json, "[0, 1, 2, 3]", list_json); // No nulls in inner list values trigger the use of FixedWidthTakeExec() in // FSLTakeExec() std::string no_nulls_list_json = "[[0, 0, 0], [1, 2, 3], [4, 5, 6], [7, 8, 9]]"; - CheckTake( - fixed_size_list(int32(), 3), no_nulls_list_json, "[2, 2, 2, 2, 2, 2, 1]", + CheckTakeXA( + no_nulls_list_json, "[2, 2, 2, 2, 2, 2, 1]", "[[4, 5, 6], [4, 5, 6], [4, 5, 6], [4, 5, 6], [4, 5, 6], [4, 5, 6], [1, 2, 3]]"); - this->TestNoValidityBitmapButUnknownNullCount(fixed_size_list(int32(), 3), - "[[1, null, 3], [4, 5, 6], [7, 8, null]]", + this->TestNoValidityBitmapButUnknownNullCount("[[1, null, 3], [4, 5, 6], [7, 8, null]]", "[0, 1, 0]"); } TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListVarWidth) { + inner_type_ = utf8(); std::string list_json = R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; - CheckTake(fixed_size_list(utf8(), 3), list_json, "[]", "[]"); - CheckTake(fixed_size_list(utf8(), 3), list_json, "[3, 2, 1]", - R"([["seven", "eight", ""], ["four", "five", "six"], ["two", "", "three"]])"); - CheckTake(fixed_size_list(utf8(), 3), list_json, "[null, 2, 0]", - R"([null, ["four", "five", "six"], ["zero", "one", ""]])"); - CheckTake(fixed_size_list(utf8(), 3), list_json, R"([null, null])", "[null, null]"); - CheckTake( - fixed_size_list(utf8(), 3), list_json, "[3, 0, 0,3]", + CheckTakeXA(list_json, "[]", "[]"); + CheckTakeXA( + list_json, "[3, 2, 1]", + R"([["seven", "eight", ""], ["four", "five", "six"], ["two", "", "three"]])"); + CheckTakeXA(list_json, "[null, 2, 0]", + R"([null, ["four", "five", "six"], ["zero", "one", ""]])"); + CheckTakeXA(list_json, R"([null, null])", "[null, null]"); + CheckTakeXA( + list_json, "[3, 0, 0,3]", R"([["seven", "eight", ""], ["zero", "one", ""], ["zero", "one", ""], ["seven", "eight", ""]])"); - CheckTake(fixed_size_list(utf8(), 3), list_json, "[0, 1, 2, 3]", list_json); - CheckTake(fixed_size_list(utf8(), 3), list_json, "[2, 2, 2, 2, 2, 2, 
1]", - R"([ + CheckTakeXA(list_json, "[0, 1, 2, 3]", list_json); + CheckTakeXA(list_json, "[2, 2, 2, 2, 2, 2, 1]", + R"([ ["four", "five", "six"], ["four", "five", "six"], ["four", "five", "six"], ["four", "five", "six"], ["four", "five", "six"], ["four", "five", "six"], @@ -1606,11 +1877,14 @@ TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListModuloNesting) { NLG::VisitAllNestedListConfigurations( value_types, [this](const std::shared_ptr& inner_type, const std::vector& list_sizes) { - this->CheckTakeOnNestedLists(inner_type, list_sizes, /*length=*/5); + this->CheckTakeXAOnNestedLists(inner_type, list_sizes, /*length=*/5); }); } -class TestTakeKernelWithMap : public TestTakeKernelTyped {}; +class TestTakeKernelWithMap : public TestTakeKernelTyped { + protected: + std::shared_ptr value_type() const override { return map(utf8(), int32()); } +}; TEST_F(TestTakeKernelWithMap, TakeMapStringToInt32) { std::string map_json = R"([ @@ -1619,21 +1893,20 @@ TEST_F(TestTakeKernelWithMap, TakeMapStringToInt32) { [["cap", 8]], [] ])"; - CheckTake(map(utf8(), int32()), map_json, "[]", "[]"); - CheckTake(map(utf8(), int32()), map_json, "[3, 1, 3, 1, 3]", - "[[], null, [], null, []]"); - CheckTake(map(utf8(), int32()), map_json, "[2, 1, null]", R"([ + CheckTakeXA(map_json, "[]", "[]"); + CheckTakeXA(map_json, "[3, 1, 3, 1, 3]", "[[], null, [], null, []]"); + CheckTakeXA(map_json, "[2, 1, null]", R"([ [["cap", 8]], null, null ])"); - CheckTake(map(utf8(), int32()), map_json, "[2, 1, 0]", R"([ + CheckTakeXA(map_json, "[2, 1, 0]", R"([ [["cap", 8]], null, [["joe", 0], ["mark", null]] ])"); - CheckTake(map(utf8(), int32()), map_json, "[0, 1, 2, 3]", map_json); - CheckTake(map(utf8(), int32()), map_json, "[0, 0, 0, 0, 0, 0, 3]", R"([ + CheckTakeXA(map_json, "[0, 1, 2, 3]", map_json); + CheckTakeXA(map_json, "[0, 0, 0, 0, 0, 0, 3]", R"([ [["joe", 0], ["mark", null]], [["joe", 0], ["mark", null]], [["joe", 0], ["mark", null]], @@ -1644,31 +1917,34 @@ TEST_F(TestTakeKernelWithMap, TakeMapStringToInt32) { ])"); } -class TestTakeKernelWithStruct : public TestTakeKernelTyped {}; +class TestTakeKernelWithStruct : public TestTakeKernelTyped { + std::shared_ptr value_type() const override { + return struct_({field("a", int32()), field("b", utf8())}); + } +}; TEST_F(TestTakeKernelWithStruct, TakeStruct) { - auto struct_type = struct_({field("a", int32()), field("b", utf8())}); auto struct_json = R"([ null, {"a": 1, "b": ""}, {"a": 2, "b": "hello"}, {"a": 4, "b": "eh"} ])"; - CheckTake(struct_type, struct_json, "[]", "[]"); - CheckTake(struct_type, struct_json, "[3, 1, 3, 1, 3]", R"([ + this->CheckTakeXA(struct_json, "[]", "[]"); + this->CheckTakeXA(struct_json, "[3, 1, 3, 1, 3]", R"([ {"a": 4, "b": "eh"}, {"a": 1, "b": ""}, {"a": 4, "b": "eh"}, {"a": 1, "b": ""}, {"a": 4, "b": "eh"} ])"); - CheckTake(struct_type, struct_json, "[3, 1, 0]", R"([ + this->CheckTakeXA(struct_json, "[3, 1, 0]", R"([ {"a": 4, "b": "eh"}, {"a": 1, "b": ""}, null ])"); - CheckTake(struct_type, struct_json, "[0, 1, 2, 3]", struct_json); - CheckTake(struct_type, struct_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ + this->CheckTakeXA(struct_json, "[0, 1, 2, 3]", struct_json); + this->CheckTakeXA(struct_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ null, {"a": 2, "b": "hello"}, {"a": 2, "b": "hello"}, @@ -1678,16 +1954,30 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) { {"a": 2, "b": "hello"} ])"); - this->TestNoValidityBitmapButUnknownNullCount( - struct_type, R"([{"a": 1}, {"a": 2, "b": "hello"}])", "[0, 1, 0]"); + 
this->TestNoValidityBitmapButUnknownNullCount(R"([{"a": 1}, {"a": 2, "b": "hello"}])", + "[0, 1, 0]"); } -class TestTakeKernelWithUnion : public TestTakeKernelTyped {}; +template +class TestTakeKernelWithUnion : public TestTakeKernelTyped { + protected: + std::shared_ptr value_type() const override { + return std::make_shared( + FieldVector{ + field("a", int32()), + field("b", utf8()), + }, + std::vector{ + 2, + 5, + }); + } +}; + +TYPED_TEST_SUITE(TestTakeKernelWithUnion, UnionArrowTypes); -TEST_F(TestTakeKernelWithUnion, TakeUnion) { - for (const auto& union_type : - {dense_union({field("a", int32()), field("b", utf8())}, {2, 5}), - sparse_union({field("a", int32()), field("b", utf8())}, {2, 5})}) { +TYPED_TEST(TestTakeKernelWithUnion, TakeUnion) { + { auto union_json = R"([ [2, 222], [2, null], @@ -1697,22 +1987,22 @@ TEST_F(TestTakeKernelWithUnion, TakeUnion) { [2, 111], [5, null] ])"; - CheckTake(union_type, union_json, "[]", "[]"); - CheckTake(union_type, union_json, "[3, 0, 3, 0, 3]", R"([ + this->CheckTakeXA(union_json, "[]", "[]"); + this->CheckTakeXA(union_json, "[3, 0, 3, 0, 3]", R"([ [5, "eh"], [2, 222], [5, "eh"], [2, 222], [5, "eh"] ])"); - CheckTake(union_type, union_json, "[4, 2, 0, 6]", R"([ + this->CheckTakeXA(union_json, "[4, 2, 0, 6]", R"([ [2, null], [5, "hello"], [2, 222], [5, null] ])"); - CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5, 6]", union_json); - CheckTake(union_type, union_json, "[1, 2, 2, 2, 2, 2, 2]", R"([ + this->CheckTakeXA(union_json, "[0, 1, 2, 3, 4, 5, 6]", union_json); + this->CheckTakeXA(union_json, "[1, 2, 2, 2, 2, 2, 2]", R"([ [2, null], [5, "hello"], [5, "hello"], @@ -1721,7 +2011,7 @@ TEST_F(TestTakeKernelWithUnion, TakeUnion) { [5, "hello"], [5, "hello"] ])"); - CheckTake(union_type, union_json, "[0, null, 1, null, 2, 2, 2]", R"([ + this->CheckTakeXA(union_json, "[0, null, 1, null, 2, 2, 2]", R"([ [2, 222], [2, null], [2, null], @@ -1735,72 +2025,58 @@ TEST_F(TestTakeKernelWithUnion, TakeUnion) { class TestPermutationsWithTake : public ::testing::Test { protected: - void DoTake(const Int16Array& values, const Int16Array& indices, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(std::shared_ptr boxed_out, Take(values, indices)); + Result> DoTakeAAA( + const std::shared_ptr& values, + const std::shared_ptr& indices) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr boxed_out, TakeAAA(*values, *indices)); ValidateOutput(boxed_out); - *out = checked_pointer_cast(std::move(boxed_out)); + return checked_pointer_cast(std::move(boxed_out)); } - std::shared_ptr DoTake(const Int16Array& values, - const Int16Array& indices) { - std::shared_ptr out; - DoTake(values, indices, &out); - return out; - } - - std::shared_ptr DoTakeN(uint64_t n, std::shared_ptr array) { + Result> DoTakeN(uint64_t n, + std::shared_ptr array) { auto power_of_2 = array; - array = Identity(array->length()); + ARROW_ASSIGN_OR_RAISE(array, Identity(array->length())); while (n != 0) { if (n & 1) { - array = DoTake(*array, *power_of_2); + ARROW_ASSIGN_OR_RAISE(array, DoTakeAAA(array, power_of_2)); } - power_of_2 = DoTake(*power_of_2, *power_of_2); + ARROW_ASSIGN_OR_RAISE(power_of_2, DoTakeAAA(power_of_2, power_of_2)); n >>= 1; } return array; } template - void Shuffle(const Int16Array& array, Rng& gen, std::shared_ptr* shuffled) { + Result> Shuffle(const Int16Array& array, Rng& gen) { auto byte_length = array.length() * sizeof(int16_t); - ASSERT_OK_AND_ASSIGN(auto data, array.values()->CopySlice(0, byte_length)); + ARROW_ASSIGN_OR_RAISE(auto data, array.values()->CopySlice(0, 
byte_length)); auto mutable_data = reinterpret_cast(data->mutable_data()); std::shuffle(mutable_data, mutable_data + array.length(), gen); - shuffled->reset(new Int16Array(array.length(), data)); - } - - template - std::shared_ptr Shuffle(const Int16Array& array, Rng& gen) { - std::shared_ptr out; - Shuffle(array, gen, &out); - return out; + return std::make_shared(array.length(), data); } - void Identity(int64_t length, std::shared_ptr* identity) { + Result> Identity(int64_t length) { + std::shared_ptr identity; Int16Builder identity_builder; - ASSERT_OK(identity_builder.Resize(length)); + RETURN_NOT_OK(identity_builder.Resize(length)); for (int16_t i = 0; i < length; ++i) { identity_builder.UnsafeAppend(i); } - ASSERT_OK(identity_builder.Finish(identity)); - } - - std::shared_ptr Identity(int64_t length) { - std::shared_ptr out; - Identity(length, &out); - return out; + RETURN_NOT_OK(identity_builder.Finish(&identity)); + return identity; } - std::shared_ptr Inverse(const std::shared_ptr& permutation) { + Result> Inverse( + const std::shared_ptr& permutation) { auto length = static_cast(permutation->length()); std::vector cycle_lengths(length + 1, false); auto permutation_to_the_i = permutation; for (int16_t cycle_length = 1; cycle_length <= length; ++cycle_length) { cycle_lengths[cycle_length] = HasTrivialCycle(*permutation_to_the_i); - permutation_to_the_i = DoTake(*permutation, *permutation_to_the_i); + ARROW_ASSIGN_OR_RAISE(permutation_to_the_i, + DoTakeAAA(permutation, permutation_to_the_i)); } uint64_t cycle_to_identity_length = 1; @@ -1836,42 +2112,18 @@ TEST_F(TestPermutationsWithTake, InvertPermutation) { for (auto seed : std::vector({0, kRandomSeed, kRandomSeed * 2 - 1})) { std::default_random_engine gen(seed); for (int16_t length = 0; length < 1 << 10; ++length) { - auto identity = Identity(length); - auto permutation = Shuffle(*identity, gen); - auto inverse = Inverse(permutation); + ASSERT_OK_AND_ASSIGN(auto identity, Identity(length)); + ASSERT_OK_AND_ASSIGN(auto permutation, Shuffle(*identity, gen)); + ASSERT_OK_AND_ASSIGN(auto inverse, Inverse(permutation)); if (inverse == nullptr) { break; } - ASSERT_TRUE(DoTake(*inverse, *permutation)->Equals(identity)); + DoCheckTakeXA(inverse, permutation, identity); } } } -class TestTakeKernelWithRecordBatch : public TestTakeKernelTyped { - public: - void AssertTake(const std::shared_ptr& schm, const std::string& batch_json, - const std::string& indices, const std::string& expected_batch) { - std::shared_ptr actual; - - for (auto index_type : {int8(), uint32()}) { - ASSERT_OK(TakeJSON(schm, batch_json, index_type, indices, &actual)); - ValidateOutput(actual); - ASSERT_BATCHES_EQUAL(*RecordBatchFromJSON(schm, expected_batch), *actual); - } - } - - Status TakeJSON(const std::shared_ptr& schm, const std::string& batch_json, - const std::shared_ptr& index_type, const std::string& indices, - std::shared_ptr* out) { - auto batch = RecordBatchFromJSON(schm, batch_json); - ARROW_ASSIGN_OR_RAISE(Datum result, - Take(Datum(batch), Datum(ArrayFromJSON(index_type, indices)))); - *out = result.record_batch(); - return Status::OK(); - } -}; - -TEST_F(TestTakeKernelWithRecordBatch, TakeRecordBatch) { +TEST(TestTakeKernelWithRecordBatch, TakeRecordBatch) { std::vector> fields = {field("a", int32()), field("b", utf8())}; auto schm = schema(fields); @@ -1881,21 +2133,21 @@ TEST_F(TestTakeKernelWithRecordBatch, TakeRecordBatch) { {"a": 2, "b": "hello"}, {"a": 4, "b": "eh"} ])"; - this->AssertTake(schm, struct_json, "[]", "[]"); - this->AssertTake(schm, 
struct_json, "[3, 1, 3, 1, 3]", R"([ + AssertTakeRAR(schm, struct_json, "[]", "[]"); + AssertTakeRAR(schm, struct_json, "[3, 1, 3, 1, 3]", R"([ {"a": 4, "b": "eh"}, {"a": 1, "b": ""}, {"a": 4, "b": "eh"}, {"a": 1, "b": ""}, {"a": 4, "b": "eh"} ])"); - this->AssertTake(schm, struct_json, "[3, 1, 0]", R"([ + AssertTakeRAR(schm, struct_json, "[3, 1, 0]", R"([ {"a": 4, "b": "eh"}, {"a": 1, "b": ""}, {"a": null, "b": "yo"} ])"); - this->AssertTake(schm, struct_json, "[0, 1, 2, 3]", struct_json); - this->AssertTake(schm, struct_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ + AssertTakeRAR(schm, struct_json, "[0, 1, 2, 3]", struct_json); + AssertTakeRAR(schm, struct_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ {"a": null, "b": "yo"}, {"a": 2, "b": "hello"}, {"a": 2, "b": "hello"}, @@ -1906,115 +2158,41 @@ TEST_F(TestTakeKernelWithRecordBatch, TakeRecordBatch) { ])"); } -class TestTakeKernelWithChunkedArray : public TestTakeKernelTyped { - public: - void AssertTake(const std::shared_ptr& type, - const std::vector& values, const std::string& indices, - const std::vector& expected) { - std::shared_ptr actual; - ASSERT_OK(this->TakeWithArray(type, values, indices, &actual)); - ValidateOutput(actual); - AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual); +TEST(TestTakeKernelWithChunkedIndices, TakeChunkedArray) { + for (auto& ty : {boolean(), int8(), uint64()}) { + AssertTakeCAC(ty, {"[]"}, "[]", {"[]"}); + AssertTakeCCC(ty, {}, {}, {}); + AssertTakeCCC(ty, {}, {"[]"}, {"[]"}); + AssertTakeCCC(ty, {}, {"[null]"}, {"[null]"}); + AssertTakeCCC(ty, {"[]"}, {}, {}); + AssertTakeCCC(ty, {"[]"}, {"[]"}, {"[]"}); + AssertTakeCCC(ty, {"[]"}, {"[null]"}, {"[null]"}); } - void AssertChunkedTake(const std::shared_ptr& type, - const std::vector& values, - const std::vector& indices, - const std::vector& expected) { - std::shared_ptr actual; - ASSERT_OK(this->TakeWithChunkedArray(type, values, indices, &actual)); - ValidateOutput(actual); - AssertChunkedEqual(*ChunkedArrayFromJSON(type, expected), *actual); - } + AssertTakeCAC(boolean(), {"[true]", "[false, true]"}, "[0, 1, 0, 2]", + {"[true, false, true, true]"}); + AssertTakeCCC(boolean(), {"[false]", "[true, false]"}, {"[0, 1, 0]", "[]", "[2]"}, + {"[false, true, false]", "[]", "[false]"}); + AssertTakeCAC(boolean(), {"[true]", "[false, true]"}, "[2, 1]", {"[true, false]"}); - Status TakeWithArray(const std::shared_ptr& type, - const std::vector& values, const std::string& indices, - std::shared_ptr* out) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(ChunkedArrayFromJSON(type, values), - ArrayFromJSON(int8(), indices))); - *out = result.chunked_array(); - return Status::OK(); - } + Datum chunked_arr; + for (auto& int_ty : SignedIntTypes()) { + AssertTakeCAC(int_ty, {"[7]", "[8, 9]"}, "[0, 1, 0, 2]", {"[7, 8, 7, 9]"}); + AssertTakeCCC(int_ty, {"[7]", "[8, 9]"}, {"[0, 1, 0]", "[]", "[2]"}, + {"[7, 8, 7]", "[]", "[9]"}); + AssertTakeCAC(int_ty, {"[7]", "[8, 9]"}, "[2, 1]", {"[9, 8]"}); - Status TakeWithChunkedArray(const std::shared_ptr& type, - const std::vector& values, - const std::vector& indices, - std::shared_ptr* out) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(ChunkedArrayFromJSON(type, values), - ChunkedArrayFromJSON(int8(), indices))); - *out = result.chunked_array(); - return Status::OK(); + ASSERT_RAISES(IndexError, + TakeCAC(int_ty, {"[7]", "[8, 9]"}, "[0, 5]").Value(&chunked_arr)); + ASSERT_RAISES( + IndexError, + TakeCCC(int_ty, {"[7]", "[8, 9]"}, {"[0, 1, 0]", "[5, 1]"}).Value(&chunked_arr)); + ASSERT_RAISES(IndexError, TakeCCC(int_ty, {}, 
{"[0]"}).Value(&chunked_arr)); + ASSERT_RAISES(IndexError, TakeCCC(int_ty, {"[]"}, {"[0]"}).Value(&chunked_arr)); } -}; - -TEST_F(TestTakeKernelWithChunkedArray, TakeChunkedArray) { - this->AssertTake(int8(), {"[]"}, "[]", {"[]"}); - this->AssertChunkedTake(int8(), {}, {}, {}); - this->AssertChunkedTake(int8(), {}, {"[]"}, {"[]"}); - this->AssertChunkedTake(int8(), {}, {"[null]"}, {"[null]"}); - this->AssertChunkedTake(int8(), {"[]"}, {}, {}); - this->AssertChunkedTake(int8(), {"[]"}, {"[]"}, {"[]"}); - this->AssertChunkedTake(int8(), {"[]"}, {"[null]"}, {"[null]"}); - - this->AssertTake(int8(), {"[7]", "[8, 9]"}, "[0, 1, 0, 2]", {"[7, 8, 7, 9]"}); - this->AssertChunkedTake(int8(), {"[7]", "[8, 9]"}, {"[0, 1, 0]", "[]", "[2]"}, - {"[7, 8, 7]", "[]", "[9]"}); - this->AssertTake(int8(), {"[7]", "[8, 9]"}, "[2, 1]", {"[9, 8]"}); - - std::shared_ptr arr; - ASSERT_RAISES(IndexError, - this->TakeWithArray(int8(), {"[7]", "[8, 9]"}, "[0, 5]", &arr)); - ASSERT_RAISES(IndexError, this->TakeWithChunkedArray(int8(), {"[7]", "[8, 9]"}, - {"[0, 1, 0]", "[5, 1]"}, &arr)); - ASSERT_RAISES(IndexError, this->TakeWithChunkedArray(int8(), {}, {"[0]"}, &arr)); - ASSERT_RAISES(IndexError, this->TakeWithChunkedArray(int8(), {"[]"}, {"[0]"}, &arr)); } -class TestTakeKernelWithTable : public TestTakeKernelTyped { - public: - void AssertTake(const std::shared_ptr& schm, - const std::vector& table_json, const std::string& filter, - const std::vector& expected_table) { - std::shared_ptr
actual; - - ASSERT_OK(this->TakeWithArray(schm, table_json, filter, &actual)); - ValidateOutput(actual); - ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual); - } - - void AssertChunkedTake(const std::shared_ptr& schm, - const std::vector& table_json, - const std::vector& filter, - const std::vector& expected_table) { - std::shared_ptr
actual; - - ASSERT_OK(this->TakeWithChunkedArray(schm, table_json, filter, &actual)); - ValidateOutput(actual); - ASSERT_TABLES_EQUAL(*TableFromJSON(schm, expected_table), *actual); - } - - Status TakeWithArray(const std::shared_ptr& schm, - const std::vector& values, const std::string& indices, - std::shared_ptr
* out) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(TableFromJSON(schm, values)), - Datum(ArrayFromJSON(int8(), indices)))); - *out = result.table(); - return Status::OK(); - } - - Status TakeWithChunkedArray(const std::shared_ptr& schm, - const std::vector& values, - const std::vector& indices, - std::shared_ptr
* out) { - ARROW_ASSIGN_OR_RAISE(Datum result, - Take(Datum(TableFromJSON(schm, values)), - Datum(ChunkedArrayFromJSON(int8(), indices)))); - *out = result.table(); - return Status::OK(); - } -}; - -TEST_F(TestTakeKernelWithTable, TakeTable) { +TEST(TestTakeKernelWithTable, TakeTable) { std::vector> fields = {field("a", int32()), field("b", utf8())}; auto schm = schema(fields); @@ -2022,11 +2200,12 @@ TEST_F(TestTakeKernelWithTable, TakeTable) { "[{\"a\": null, \"b\": \"yo\"},{\"a\": 1, \"b\": \"\"}]", "[{\"a\": 2, \"b\": \"hello\"},{\"a\": 4, \"b\": \"eh\"}]"}; - this->AssertTake(schm, table_json, "[]", {"[]"}); + AssertTakeTAT(schm, table_json, "[]", {"[]"}); std::vector expected_310 = { - "[{\"a\": 4, \"b\": \"eh\"},{\"a\": 1, \"b\": \"\"},{\"a\": null, \"b\": \"yo\"}]"}; - this->AssertTake(schm, table_json, "[3, 1, 0]", expected_310); - this->AssertChunkedTake(schm, table_json, {"[0, 1]", "[2, 3]"}, table_json); + "[{\"a\": 4, \"b\": \"eh\"},{\"a\": 1, \"b\": \"\"},{\"a\": null, \"b\": " + "\"yo\"}]"}; + AssertTakeTAT(schm, table_json, "[3, 1, 0]", expected_310); + AssertTakeTCT(schm, table_json, {"[0, 1]", "[2, 3]"}, table_json); } TEST(TestTakeMetaFunction, ArityChecking) { @@ -2066,14 +2245,14 @@ void CheckTakeRandom(const std::shared_ptr& values, int64_t indices_lengt max_index, null_probability); auto indices_no_nulls = rand->Numeric( indices_length, static_cast(0), max_index, /*null_probability=*/0.0); - ValidateTake(values, indices); - ValidateTake(values, indices_no_nulls); + ValidateTakeXA(values, indices); + ValidateTakeXA(values, indices_no_nulls); // Sliced indices array if (indices_length >= 2) { indices = indices->Slice(1, indices_length - 2); indices_no_nulls = indices_no_nulls->Slice(1, indices_length - 2); - ValidateTake(values, indices); - ValidateTake(values, indices_no_nulls); + ValidateTakeXA(values, indices); + ValidateTakeXA(values, indices_no_nulls); } } From 0fbea66a4bcdec2dc6bbffd2877c143e47d1653d Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 25 Jul 2024 15:42:07 -0400 Subject: [PATCH 21/73] GH-42085: [Python] Test FlightStreamReader iterator (#42086) ### Rationale for this change `FlightStreamReader` correctly implemented iterator functionality in https://github.com/apache/arrow/pull/37097/files#diff-0ed358f5d42920d7f94cc500791976a2c158c4d72f4a6b231393534b2d13683bR993. Let's update tests to use this functionality. ### What changes are included in this PR? * Tests are updated ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #42085 Authored-by: Dane Pitkin Signed-off-by: Dane Pitkin --- python/pyarrow/tests/test_flight.py | 58 ++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 4853dafc76c72..832c6a2dbdf9f 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -180,20 +180,16 @@ def do_get(self, context, ticket): def do_put(self, context, descriptor, reader, writer): counter = 0 expected_data = [-10, -5, 0, 5, 10] - while True: - try: - batch, buf = reader.read_chunk() - assert batch.equals(pa.RecordBatch.from_arrays( - [pa.array([expected_data[counter]])], - ['a'] - )) - assert buf is not None - client_counter, = struct.unpack(' Date: Thu, 25 Jul 2024 15:49:52 -0400 Subject: [PATCH 22/73] GH-43169: [Swift] Add StructArray to ArrowReader (#43335) ### Rationale for this change Structs have been added for Swift but currently the ArrowReader does not support them. This PR adds the ArrowReader support ### What changes are included in this PR? Adding StructArray to ArrowReader ### Are these changes tested? The next PR for the ArrowWriter will include a test for reading and writing Structs. * GitHub Issue: #43169 Authored-by: Alva Bandy Signed-off-by: Sutou Kouhei --- .../Arrow/Sources/Arrow/ArrowCImporter.swift | 3 +- swift/Arrow/Sources/Arrow/ArrowReader.swift | 199 ++++++++++++------ .../Sources/Arrow/ArrowReaderHelper.swift | 59 +++++- swift/Arrow/Tests/ArrowTests/ArrayTests.swift | 2 +- 4 files changed, 194 insertions(+), 69 deletions(-) diff --git a/swift/Arrow/Sources/Arrow/ArrowCImporter.swift b/swift/Arrow/Sources/Arrow/ArrowCImporter.swift index f55077ef3dc95..e65d78d730be7 100644 --- a/swift/Arrow/Sources/Arrow/ArrowCImporter.swift +++ b/swift/Arrow/Sources/Arrow/ArrowCImporter.swift @@ -153,7 +153,8 @@ public class ArrowCImporter { } } - switch makeArrayHolder(arrowField, buffers: arrowBuffers, nullCount: nullCount) { + switch makeArrayHolder(arrowField, buffers: arrowBuffers, + nullCount: nullCount, children: nil, rbLength: 0) { case .success(let holder): return .success(ImportArrayHolder(holder, cArrayPtr: cArrayPtr)) case .failure(let err): diff --git a/swift/Arrow/Sources/Arrow/ArrowReader.swift b/swift/Arrow/Sources/Arrow/ArrowReader.swift index 237f22dc979e3..ae187e22eef70 100644 --- a/swift/Arrow/Sources/Arrow/ArrowReader.swift +++ b/swift/Arrow/Sources/Arrow/ArrowReader.swift @@ -21,14 +21,46 @@ import Foundation let FILEMARKER = "ARROW1" let CONTINUATIONMARKER = -1 -public class ArrowReader { - private struct DataLoadInfo { +public class ArrowReader { // swiftlint:disable:this type_body_length + private class RecordBatchData { + let schema: org_apache_arrow_flatbuf_Schema let recordBatch: org_apache_arrow_flatbuf_RecordBatch - let field: org_apache_arrow_flatbuf_Field - let nodeIndex: Int32 - let bufferIndex: Int32 + private var fieldIndex: Int32 = 0 + private var nodeIndex: Int32 = 0 + private var bufferIndex: Int32 = 0 + init(_ recordBatch: org_apache_arrow_flatbuf_RecordBatch, + schema: org_apache_arrow_flatbuf_Schema) { + self.recordBatch = recordBatch + self.schema = schema + } + + func nextNode() -> org_apache_arrow_flatbuf_FieldNode? { + if nodeIndex >= self.recordBatch.nodesCount {return nil} + defer {nodeIndex += 1} + return self.recordBatch.nodes(at: nodeIndex) + } + + func nextBuffer() -> org_apache_arrow_flatbuf_Buffer? 
{ + if bufferIndex >= self.recordBatch.buffersCount {return nil} + defer {bufferIndex += 1} + return self.recordBatch.buffers(at: bufferIndex) + } + + func nextField() -> org_apache_arrow_flatbuf_Field? { + if fieldIndex >= self.schema.fieldsCount {return nil} + defer {fieldIndex += 1} + return self.schema.fields(at: fieldIndex) + } + + func isDone() -> Bool { + return nodeIndex >= self.recordBatch.nodesCount + } + } + + private struct DataLoadInfo { let fileData: Data let messageOffset: Int64 + var batchData: RecordBatchData } public class ArrowReaderResult { @@ -54,49 +86,104 @@ public class ArrowReader { return .success(builder.finish()) } - private func loadPrimitiveData(_ loadInfo: DataLoadInfo) -> Result { - do { - let node = loadInfo.recordBatch.nodes(at: loadInfo.nodeIndex)! - let nullLength = UInt(ceil(Double(node.length) / 8)) - try validateBufferIndex(loadInfo.recordBatch, index: loadInfo.bufferIndex) - let nullBuffer = loadInfo.recordBatch.buffers(at: loadInfo.bufferIndex)! - let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, - length: nullLength, messageOffset: loadInfo.messageOffset) - try validateBufferIndex(loadInfo.recordBatch, index: loadInfo.bufferIndex + 1) - let valueBuffer = loadInfo.recordBatch.buffers(at: loadInfo.bufferIndex + 1)! - let arrowValueBuffer = makeBuffer(valueBuffer, fileData: loadInfo.fileData, - length: UInt(node.length), messageOffset: loadInfo.messageOffset) - return makeArrayHolder(loadInfo.field, buffers: [arrowNullBuffer, arrowValueBuffer], - nullCount: UInt(node.nullCount)) - } catch let error as ArrowError { - return .failure(error) - } catch { - return .failure(.unknownError("\(error)")) + private func loadStructData(_ loadInfo: DataLoadInfo, + field: org_apache_arrow_flatbuf_Field) + -> Result { + guard let node = loadInfo.batchData.nextNode() else { + return .failure(.invalid("Node not found")) + } + + guard let nullBuffer = loadInfo.batchData.nextBuffer() else { + return .failure(.invalid("Null buffer not found")) + } + + let nullLength = UInt(ceil(Double(node.length) / 8)) + let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, + length: nullLength, messageOffset: loadInfo.messageOffset) + var children = [ArrowData]() + for index in 0.. Result { - let node = loadInfo.recordBatch.nodes(at: loadInfo.nodeIndex)! - do { - let nullLength = UInt(ceil(Double(node.length) / 8)) - try validateBufferIndex(loadInfo.recordBatch, index: loadInfo.bufferIndex) - let nullBuffer = loadInfo.recordBatch.buffers(at: loadInfo.bufferIndex)! - let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, - length: nullLength, messageOffset: loadInfo.messageOffset) - try validateBufferIndex(loadInfo.recordBatch, index: loadInfo.bufferIndex + 1) - let offsetBuffer = loadInfo.recordBatch.buffers(at: loadInfo.bufferIndex + 1)! - let arrowOffsetBuffer = makeBuffer(offsetBuffer, fileData: loadInfo.fileData, - length: UInt(node.length), messageOffset: loadInfo.messageOffset) - try validateBufferIndex(loadInfo.recordBatch, index: loadInfo.bufferIndex + 2) - let valueBuffer = loadInfo.recordBatch.buffers(at: loadInfo.bufferIndex + 2)! 
- let arrowValueBuffer = makeBuffer(valueBuffer, fileData: loadInfo.fileData, - length: UInt(node.length), messageOffset: loadInfo.messageOffset) - return makeArrayHolder(loadInfo.field, buffers: [arrowNullBuffer, arrowOffsetBuffer, arrowValueBuffer], - nullCount: UInt(node.nullCount)) - } catch let error as ArrowError { - return .failure(error) - } catch { - return .failure(.unknownError("\(error)")) + private func loadPrimitiveData( + _ loadInfo: DataLoadInfo, + field: org_apache_arrow_flatbuf_Field) + -> Result { + guard let node = loadInfo.batchData.nextNode() else { + return .failure(.invalid("Node not found")) + } + + guard let nullBuffer = loadInfo.batchData.nextBuffer() else { + return .failure(.invalid("Null buffer not found")) + } + + guard let valueBuffer = loadInfo.batchData.nextBuffer() else { + return .failure(.invalid("Value buffer not found")) + } + + let nullLength = UInt(ceil(Double(node.length) / 8)) + let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, + length: nullLength, messageOffset: loadInfo.messageOffset) + let arrowValueBuffer = makeBuffer(valueBuffer, fileData: loadInfo.fileData, + length: UInt(node.length), messageOffset: loadInfo.messageOffset) + return makeArrayHolder(field, buffers: [arrowNullBuffer, arrowValueBuffer], + nullCount: UInt(node.nullCount), children: nil, + rbLength: UInt(loadInfo.batchData.recordBatch.length)) + } + + private func loadVariableData( + _ loadInfo: DataLoadInfo, + field: org_apache_arrow_flatbuf_Field) + -> Result { + guard let node = loadInfo.batchData.nextNode() else { + return .failure(.invalid("Node not found")) + } + + guard let nullBuffer = loadInfo.batchData.nextBuffer() else { + return .failure(.invalid("Null buffer not found")) + } + + guard let offsetBuffer = loadInfo.batchData.nextBuffer() else { + return .failure(.invalid("Offset buffer not found")) + } + + guard let valueBuffer = loadInfo.batchData.nextBuffer() else { + return .failure(.invalid("Value buffer not found")) + } + + let nullLength = UInt(ceil(Double(node.length) / 8)) + let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, + length: nullLength, messageOffset: loadInfo.messageOffset) + let arrowOffsetBuffer = makeBuffer(offsetBuffer, fileData: loadInfo.fileData, + length: UInt(node.length), messageOffset: loadInfo.messageOffset) + let arrowValueBuffer = makeBuffer(valueBuffer, fileData: loadInfo.fileData, + length: UInt(node.length), messageOffset: loadInfo.messageOffset) + return makeArrayHolder(field, buffers: [arrowNullBuffer, arrowOffsetBuffer, arrowValueBuffer], + nullCount: UInt(node.nullCount), children: nil, + rbLength: UInt(loadInfo.batchData.recordBatch.length)) + } + + private func loadField( + _ loadInfo: DataLoadInfo, + field: org_apache_arrow_flatbuf_Field) + -> Result { + if isNestedType(field.typeType) { + return loadStructData(loadInfo, field: field) + } else if isFixedPrimitive(field.typeType) { + return loadPrimitiveData(loadInfo, field: field) + } else { + return loadVariableData(loadInfo, field: field) } } @@ -107,23 +194,17 @@ public class ArrowReader { data: Data, messageEndOffset: Int64 ) -> Result { - let nodesCount = recordBatch.nodesCount - var bufferIndex: Int32 = 0 var columns: [ArrowArrayHolder] = [] - for nodeIndex in 0 ..< nodesCount { - let field = schema.fields(at: nodeIndex)! 
- let loadInfo = DataLoadInfo(recordBatch: recordBatch, field: field, - nodeIndex: nodeIndex, bufferIndex: bufferIndex, - fileData: data, messageOffset: messageEndOffset) - var result: Result - if isFixedPrimitive(field.typeType) { - result = loadPrimitiveData(loadInfo) - bufferIndex += 2 - } else { - result = loadVariableData(loadInfo) - bufferIndex += 3 + let batchData = RecordBatchData(recordBatch, schema: schema) + let loadInfo = DataLoadInfo(fileData: data, + messageOffset: messageEndOffset, + batchData: batchData) + while !batchData.isDone() { + guard let field = batchData.nextField() else { + return .failure(.invalid("Field not found")) } + let result = loadField(loadInfo, field: field) switch result { case .success(let holder): columns.append(holder) diff --git a/swift/Arrow/Sources/Arrow/ArrowReaderHelper.swift b/swift/Arrow/Sources/Arrow/ArrowReaderHelper.swift index 22c0672b27eac..48c6fd855073a 100644 --- a/swift/Arrow/Sources/Arrow/ArrowReaderHelper.swift +++ b/swift/Arrow/Sources/Arrow/ArrowReaderHelper.swift @@ -117,19 +117,42 @@ private func makeFixedHolder( } } + func makeStructHolder( + _ field: ArrowField, + buffers: [ArrowBuffer], + nullCount: UInt, + children: [ArrowData], + rbLength: UInt +) -> Result { + do { + let arrowData = try ArrowData(field.type, + buffers: buffers, children: children, + nullCount: nullCount, length: rbLength) + return .success(ArrowArrayHolderImpl(try StructArray(arrowData))) + } catch let error as ArrowError { + return .failure(error) + } catch { + return .failure(.unknownError("\(error)")) + } +} + func makeArrayHolder( _ field: org_apache_arrow_flatbuf_Field, buffers: [ArrowBuffer], - nullCount: UInt + nullCount: UInt, + children: [ArrowData]?, + rbLength: UInt ) -> Result { let arrowField = fromProto(field: field) - return makeArrayHolder(arrowField, buffers: buffers, nullCount: nullCount) + return makeArrayHolder(arrowField, buffers: buffers, nullCount: nullCount, children: children, rbLength: rbLength) } func makeArrayHolder( // swiftlint:disable:this cyclomatic_complexity _ field: ArrowField, buffers: [ArrowBuffer], - nullCount: UInt + nullCount: UInt, + children: [ArrowData]?, + rbLength: UInt ) -> Result { let typeId = field.type.id switch typeId { @@ -159,12 +182,12 @@ func makeArrayHolder( // swiftlint:disable:this cyclomatic_complexity return makeStringHolder(buffers, nullCount: nullCount) case .binary: return makeBinaryHolder(buffers, nullCount: nullCount) - case .date32: + case .date32, .date64: return makeDateHolder(field, buffers: buffers, nullCount: nullCount) - case .time32: - return makeTimeHolder(field, buffers: buffers, nullCount: nullCount) - case .time64: + case .time32, .time64: return makeTimeHolder(field, buffers: buffers, nullCount: nullCount) + case .strct: + return makeStructHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength) default: return .failure(.unknownType("Type \(typeId) currently not supported")) } @@ -187,7 +210,16 @@ func isFixedPrimitive(_ type: org_apache_arrow_flatbuf_Type_) -> Bool { } } -func findArrowType( // swiftlint:disable:this cyclomatic_complexity +func isNestedType(_ type: org_apache_arrow_flatbuf_Type_) -> Bool { + switch type { + case .struct_: + return true + default: + return false + } +} + +func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_body_length _ field: org_apache_arrow_flatbuf_Field) -> ArrowType { let type = field.typeType switch type { @@ -229,6 +261,17 @@ func findArrowType( // swiftlint:disable:this 
cyclomatic_complexity } return ArrowTypeTime64(timeType.unit == .microsecond ? .microseconds : .nanoseconds) + case .struct_: + _ = field.type(type: org_apache_arrow_flatbuf_Struct_.self)! + var fields = [ArrowField]() + for index in 0.. Date: Fri, 26 Jul 2024 01:12:44 +0200 Subject: [PATCH 23/73] GH-43418: [CI] Add wheels and java-jars to vcpkg group for tasks (#43419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Grouping the vcpkg related tasks for CI purposes. ### What changes are included in this PR? Adding missing vcpkg jobs to the vcpkg group ### Are these changes tested? I will trigger CI to validate those are run ### Are there any user-facing changes? No * GitHub Issue: #43418 Authored-by: Raúl Cumplido Signed-off-by: Jacob Wujciak-Jens --- dev/tasks/tasks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 32534e80528af..fe6d7fa22dd98 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -98,6 +98,8 @@ groups: vcpkg: - test-*vcpkg* + - wheel-* + - java-jars integration: - test-*dask* From 9174bb7deee35f9edcbf1ca2c6aae5c53fd909ca Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 26 Jul 2024 20:55:16 +0530 Subject: [PATCH 24/73] GH-43377: [Java][CI] Java-Jars CI is Failing with a linking error on macOS (#43385) ### Rationale for this change For `googletest`, we have installation via BUNDLED and brew, a version mismatch from one of these options could cause conflicts and linking related issues. Preferring BUNDLED version, this PR uninstalls the brew installation of `googletest`. ### What changes are included in this PR? Removing brew installation of `googletest` ### Are these changes tested? Yes. ### Are there any user-facing changes? No * GitHub Issue: #43377 Authored-by: Vibhatha Abeykoon Signed-off-by: Dane Pitkin --- dev/tasks/java-jars/github.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 9493be05be6ee..ba988f893148f 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -83,7 +83,7 @@ jobs: - { runs_on: ["macos-13"], arch: "x86_64"} - { runs_on: ["macos-14"], arch: "aarch_64" } env: - MACOSX_DEPLOYMENT_TARGET: "10.15" + MACOSX_DEPLOYMENT_TARGET: "14.0" steps: {{ macros.github_checkout_arrow()|indent }} - name: Set up Python @@ -140,6 +140,12 @@ jobs: brew uninstall protobuf brew bundle --file=arrow/java/Brewfile + + # We want to use the bundled googletest for static linking. Since + # both BUNDLED and brew options are enabled, it could cause a conflict + # when there is a version mismatch. + # We uninstall googletest to ensure using the bundled googletest. 
+ brew uninstall googletest - name: Build C++ libraries env: {{ macros.github_set_sccache_envvars()|indent(8) }} From e5c3659977bd399f4713911c111906c2d0f8152b Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Fri, 26 Jul 2024 09:12:36 -0700 Subject: [PATCH 25/73] GH-43340: [JS] Fix build on node v22 and fix `arrow2csv` bin extension (#43342) Fixes #42229 Fixes #43340 Fixes #43341 * GitHub Issue: #43340 --- js/gulp/closure-task.js | 13 +++++++------ js/gulp/package-task.js | 11 ++++++----- js/gulp/util.js | 3 +-- js/package.json | 1 - js/yarn.lock | 4 ---- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js index 80f841bf729f1..7a76d21d46100 100644 --- a/js/gulp/closure-task.js +++ b/js/gulp/closure-task.js @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -import { targetDir, mainExport, esmRequire, gCCLanguageNames, publicModulePaths, observableFromStreams, shouldRunInChildProcess, spawnGulpCommandInChildProcess } from "./util.js"; +import { targetDir, mainExport, gCCLanguageNames, publicModulePaths, observableFromStreams, shouldRunInChildProcess, spawnGulpCommandInChildProcess } from "./util.js"; import fs from 'node:fs'; import gulp from 'gulp'; @@ -48,12 +48,13 @@ export const closureTask = ((cache) => memoizeTask(cache, async function closure const externs = Path.join(`${out}/${mainExport}.externs.js`); const entry_point = Path.join(`${src}/${mainExport}.dom.cls.js`); - const exportedImports = publicModulePaths(srcAbsolute).reduce((entries, publicModulePath) => [ - ...entries, { + const exportedImports = []; + for (const publicModulePath of publicModulePaths(srcAbsolute)) { + exportedImports.push({ publicModulePath, - exports_: getPublicExportedNames(esmRequire(publicModulePath)) - } - ], []); + exports_: getPublicExportedNames(await import(`file://${publicModulePath}`)) + }); + } await mkdirp(out); diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js index 0b0f4cfa20b8b..36e0e57b9a7ae 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -40,7 +40,6 @@ export default packageTask; const createMainPackageJson = (target, format) => (orig) => ({ ...createTypeScriptPackageJson(target, format)(orig), - bin: orig.bin, name: npmPkgName, type: 'commonjs', main: `${mainExport}.node.js`, @@ -90,7 +89,6 @@ const createMainPackageJson = (target, format) => (orig) => ({ const createTypeScriptPackageJson = (target, format) => (orig) => ({ ...createScopedPackageJSON(target, format)(orig), - bin: undefined, main: `${mainExport}.node.ts`, module: `${mainExport}.node.ts`, types: `${mainExport}.node.ts`, @@ -108,6 +106,9 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => packageJSONFields.reduce( (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), { + bin: Object.entries(orig.bin).reduce((xs, [key, val]) => ({ + ...xs, [key]: val.replace('.cjs', '.js') + }), {}), // un-set version, since it's automatically applied during the release process version: undefined, // set the scoped package name (e.g. "@apache-arrow/esnext-esm") @@ -120,11 +121,11 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => // set "main" to "Arrow" if building scoped UMD target, otherwise "Arrow.node" main: format === 'umd' ? `${mainExport}.js` : `${mainExport}.node.js`, // set "type" to `module` or `commonjs` (https://nodejs.org/api/packages.html#packages_type) - type: format === 'esm' ? 
`module` : `commonjs`, + type: format === 'esm' || format === 'cls' ? `module` : `commonjs`, // set "module" if building scoped ESM target - module: format === 'esm' ? `${mainExport}.node.js` : undefined, + module: format === 'esm' || format === 'cls' ? `${mainExport}.node.js` : undefined, // set "sideEffects" to false as a hint to Webpack that it's safe to tree-shake the ESM target - sideEffects: format === 'esm' ? false : undefined, + sideEffects: format === 'esm' || format === 'cls' ? false : undefined, // include "esm" settings for https://www.npmjs.com/package/esm if building scoped ESM target esm: format === `esm` ? { mode: `auto`, sourceMap: true } : undefined, // set "types" to "Arrow.dom" if building scoped UMD target, otherwise "Arrow.node" diff --git a/js/gulp/util.js b/js/gulp/util.js index 2ce756f4acafa..a96c0891b87ed 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -27,7 +27,6 @@ import { ReplaySubject, empty as ObservableEmpty, throwError as ObservableThrow, import { share, flatMap, takeUntil, defaultIfEmpty, mergeWith } from 'rxjs/operators'; const asyncDone = util.promisify(asyncDoneSync); import { createRequire } from 'node:module'; -import esmRequire from './esm-require.cjs' const require = createRequire(import.meta.url); @@ -177,7 +176,7 @@ export { knownTargets, knownModules, tasksToSkipPerTargetOrFormat, gCCLanguageNames, taskName, packageName, tsconfigName, targetDir, combinations, observableFromStreams, - publicModulePaths, esmRequire, shouldRunInChildProcess, spawnGulpCommandInChildProcess, + publicModulePaths, shouldRunInChildProcess, spawnGulpCommandInChildProcess, }; export const targetAndModuleCombinations = [...combinations(targets, modules)]; diff --git a/js/package.json b/js/package.json index c754f5b39c201..4edff4d363183 100644 --- a/js/package.json +++ b/js/package.json @@ -84,7 +84,6 @@ "eslint": "8.57.0", "eslint-plugin-jest": "28.5.0", "eslint-plugin-unicorn": "54.0.0", - "esm": "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz", "gulp": "4.0.2", "glob": "10.4.1", "google-closure-compiler": "20240317.0.0", diff --git a/js/yarn.lock b/js/yarn.lock index cfa45edef4f00..37ef0d09ca005 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -3140,10 +3140,6 @@ eslint@8.57.0: strip-ansi "^6.0.1" text-table "^0.2.0" -"esm@https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz": - version "3.2.25" - resolved "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz#c463cfa4e14aceea6b7cd7e669ef90de072ea60a" - esniff@^2.0.1: version "2.0.1" resolved "https://registry.yarnpkg.com/esniff/-/esniff-2.0.1.tgz#a4d4b43a5c71c7ec51c51098c1d8a29081f9b308" From 2101844ca784fb97c80402cf2d3335e79c507c88 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 26 Jul 2024 17:24:45 -0300 Subject: [PATCH 26/73] GH-43429: [C++][FlightRPC] Fix Flight UCX build issues (#43430) ### Rationale for this change Fixing compilation errors. ### What changes are included in this PR? - Casts of integer types - Fixing the name of functions (these mistakes weren't caught because UCX is not built on CI) ### Are these changes tested? Locally by building and running `arrow-flight-transport-ucx-test`. 
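For illustration, a minimal sketch of the integer-cast portion of the fix — these are simplified stand-ins rather than the actual Arrow/UCX types, and `SockAddrParams` and `FillSockAddr` are hypothetical names:

```cpp
#include <sys/socket.h>  // sockaddr, socklen_t
#include <cstddef>

// Stand-in for a UCX param struct whose addrlen field is a socklen_t.
struct SockAddrParams {
  const sockaddr* addr;
  socklen_t addrlen;
};

// Assigning a size_t (or any wider type) straight into socklen_t is an
// implicit narrowing conversion that stricter compiler settings reject;
// the explicit static_cast states the intent and fixes the build error.
void FillSockAddr(SockAddrParams& params, const sockaddr* sa, std::size_t len) {
  params.addr = sa;
  params.addrlen = static_cast<socklen_t>(len);
}
```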
* GitHub Issue: #43429 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/flight/transport/ucx/ucx_client.cc | 2 +- cpp/src/arrow/flight/transport/ucx/ucx_server.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/flight/transport/ucx/ucx_client.cc b/cpp/src/arrow/flight/transport/ucx/ucx_client.cc index 946ac2d176203..a78b6f825a0e9 100644 --- a/cpp/src/arrow/flight/transport/ucx/ucx_client.cc +++ b/cpp/src/arrow/flight/transport/ucx/ucx_client.cc @@ -118,7 +118,7 @@ class ClientConnection { params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; params.name = "UcxClientImpl"; params.sockaddr.addr = reinterpret_cast(&connect_addr); - params.sockaddr.addrlen = addrlen; + params.sockaddr.addrlen = static_cast(addrlen); auto status = ucp_ep_create(ucp_worker_->get(), ¶ms, &remote_endpoint_); RETURN_NOT_OK(FromUcsStatus("ucp_ep_create", status)); diff --git a/cpp/src/arrow/flight/transport/ucx/ucx_server.cc b/cpp/src/arrow/flight/transport/ucx/ucx_server.cc index 55ff138348812..b1096ece77b1b 100644 --- a/cpp/src/arrow/flight/transport/ucx/ucx_server.cc +++ b/cpp/src/arrow/flight/transport/ucx/ucx_server.cc @@ -258,7 +258,7 @@ class UcxServerImpl : public arrow::flight::internal::ServerTransport { params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; params.sockaddr.addr = reinterpret_cast(&listen_addr); - params.sockaddr.addrlen = addrlen; + params.sockaddr.addrlen = static_cast(addrlen); params.conn_handler.cb = HandleIncomingConnection; params.conn_handler.arg = this; @@ -376,7 +376,7 @@ class UcxServerImpl : public arrow::flight::internal::ServerTransport { std::unique_ptr info; std::string response; SERVER_RETURN_NOT_OK(driver, base_->GetFlightInfo(context, descriptor, &info)); - SERVER_RETURN_NOT_OK(driver, info->DoSerializeToString(&response)); + SERVER_RETURN_NOT_OK(driver, info->SerializeToString(&response)); RETURN_NOT_OK(driver->SendFrame(FrameType::kBuffer, reinterpret_cast(response.data()), static_cast(response.size()))); @@ -397,7 +397,7 @@ class UcxServerImpl : public arrow::flight::internal::ServerTransport { std::unique_ptr info; std::string response; SERVER_RETURN_NOT_OK(driver, base_->PollFlightInfo(context, descriptor, &info)); - SERVER_RETURN_NOT_OK(driver, info->DoSerializeToString(&response)); + SERVER_RETURN_NOT_OK(driver, info->SerializeToString(&response)); RETURN_NOT_OK(driver->SendFrame(FrameType::kBuffer, reinterpret_cast(response.data()), static_cast(response.size()))); From aaeff72dd9cb4658913fde3d176416be9a93ebe0 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Sat, 27 Jul 2024 00:09:40 +0200 Subject: [PATCH 27/73] GH-43228: [C++] Fix Abseil compile error on GCC 13 (#43157) ### Rationale for this change When trying to compile Arrow with GCC 13, it fails due to ABSEIL missing a `` include, this PR addresses the issue by adding the missing include. There have been past reports for this issue too: https://github.com/apache/arrow/issues/36969 This is a more minimal fix that tries to avoid the complexity of previous attempts like https://github.com/apache/arrow/pull/43147 and https://github.com/apache/arrow/pull/37066 which involved updating Abseil and facing additional issues to fix. ### What changes are included in this PR? Add the missing include when GCC>=13 ### Are these changes tested? They are tested by the existing compile infrastructure and testsuite and by adding a new GCC-13 based CPP test environment for bundled builds. 
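A minimal reduction of the underlying failure, as a sketch: the assumption is that the culprit is the fixed-width integer typedefs, which GCC 13's libstdc++ no longer provides transitively through other standard headers, while the actual change force-includes `stdint.h` for the bundled Abseil build:

```cpp
#include <cstdint>  // required explicitly on GCC 13; older toolchains often
                    // pulled it in transitively, which masked the missing include

// Without the include above, uint64_t/uint32_t are undeclared on GCC 13
// and compilation fails inside the affected headers.
std::uint64_t MakeId(std::uint32_t hi, std::uint32_t lo) {
  return (static_cast<std::uint64_t>(hi) << 32) | lo;
}
```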
### Are there any user-facing changes? No, all behaviours should remain the same * GitHub Issue: #43228 Lead-authored-by: Alessandro Molina Co-authored-by: Sutou Kouhei Signed-off-by: Jacob Wujciak-Jens --- ci/docker/ubuntu-24.04-cpp-minimal.dockerfile | 104 ++++++++++++++++++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 4 + dev/tasks/tasks.yml | 9 ++ r/configure | 14 --- 4 files changed, 117 insertions(+), 14 deletions(-) create mode 100644 ci/docker/ubuntu-24.04-cpp-minimal.dockerfile diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile new file mode 100644 index 0000000000000..a995ab2a8bc2d --- /dev/null +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base=amd64/ubuntu:24.04 +FROM ${base} + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +RUN apt-get update -y -q && \ + apt-get install -y -q \ + build-essential \ + ccache \ + cmake \ + curl \ + git \ + libssl-dev \ + libcurl4-openssl-dev \ + python3-pip \ + tzdata \ + tzdata-legacy \ + wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Installs LLVM toolchain, for Gandiva and testing other compilers +# +# Note that this is installed before the base packages to improve iteration +# while debugging package list with docker build. 
+ARG llvm +RUN latest_system_llvm=14 && \ + if [ ${llvm} -gt ${latest_system_llvm} ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + wget && \ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ + code_name=$(lsb_release --codename --short) && \ + if [ ${llvm} -gt 10 ]; then \ + echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list; \ + fi; \ + fi && \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + clang-${llvm} \ + llvm-${llvm}-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ + ARROW_BUILD_TESTS=ON \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HDFS=ON \ + ARROW_HOME=/usr/local \ + ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_S3=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=OFF \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + CMAKE_GENERATOR="Unix Makefiles" \ + PARQUET_BUILD_EXAMPLES=ON \ + PARQUET_BUILD_EXECUTABLES=ON \ + PATH=/usr/lib/ccache/:$PATH \ + PYTHON=python3 diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 5b89a831ff7fe..1c8c40d6f9c52 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2882,6 +2882,10 @@ macro(build_absl) set(ABSL_INCLUDE_DIR "${ABSL_PREFIX}/include") set(ABSL_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" -DABSL_RUN_TESTS=OFF "-DCMAKE_INSTALL_PREFIX=${ABSL_PREFIX}") + if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + set(ABSL_CXX_FLAGS "${EP_CXX_FLAGS} -include stdint.h") + list(APPEND ABSL_CMAKE_ARGS "-DCMAKE_CXX_FLAGS=${ABSL_CXX_FLAGS}") + endif() set(ABSL_BUILD_BYPRODUCTS) set(ABSL_LIBRARIES) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index fe6d7fa22dd98..5c8a7c4990d7a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1060,6 +1060,15 @@ tasks: UBUNTU: 20.04 image: ubuntu-cpp-bundled + test-ubuntu-24.04-cpp-gcc-13-bundled: + ci: github + template: docker-tests/github.linux.yml + params: + env: + UBUNTU: 24.04 + GCC_VERSION: 13 + image: ubuntu-cpp-bundled + test-ubuntu-24.04-cpp: ci: github template: docker-tests/github.linux.yml diff --git a/r/configure b/r/configure index 0882ee6719c4b..e1f0bad378719 100755 --- a/r/configure +++ b/r/configure @@ -95,20 +95,6 @@ if [ ! "`${R_HOME}/bin/R CMD config CXX17`" ]; then exit 1 fi -# GH-36969: The version of Abseil used in the bundled build won't build on -# gcc-13. 
As a workaround for the 13.0.0 release, explicitly set -# ARROW_WITH_GOOGLE_CLOUD_CPP to OFF (if not already set) -if [ -z "$ARROW_GCS" ]; then - CXX17=`${R_HOME}/bin/R CMD config CXX17` - CXX17_VERSION=`$CXX17 --version` - if echo "$CXX17_VERSION" | grep -e "g++" > /dev/null ; then - if echo "$CXX17_VERSION" | grep -e "13.[0-9]\+.[0-9]\+" > /dev/null ; then - echo "*** Disabling Arrow build with GCS on gcc-13." - echo "*** Set ARROW_GCS=ON to explicitly enable." - export ARROW_GCS="OFF" - fi - fi -fi # Test if pkg-config is available to use if ${PKG_CONFIG} --version >/dev/null 2>&1; then From 187197c369058f7d1377c1b161c469a9e4542caf Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 27 Jul 2024 11:53:05 -0500 Subject: [PATCH 28/73] GH-43349: [R] Fix altrep string columns from readr (#43351) ### Rationale for this change To resolve the reverse dependency issue with `parquetize` ### What changes are included in this PR? One step towards resolving the issue ### Are these changes tested? yes ### Are there any user-facing changes? no * GitHub Issue: #43349 Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- r/src/arrow_cpp11.h | 11 ++++++++++- r/tests/testthat/test-csv.R | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index b2ed66b83c3d1..073b577d63ade 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -138,7 +138,13 @@ inline R_xlen_t r_string_size(SEXP s) { } // namespace unsafe inline SEXP utf8_strings(SEXP x) { - return cpp11::unwind_protect([x] { + return cpp11::unwind_protect([&] { + // ensure that x is not actually altrep first this also ensures that + // x is not altrep even after it is materialized + bool was_altrep = ALTREP(x); + if (was_altrep) { + x = PROTECT(Rf_duplicate(x)); + } R_xlen_t n = XLENGTH(x); // if `x` is an altrep of some sort, this will @@ -152,6 +158,9 @@ inline SEXP utf8_strings(SEXP x) { SET_STRING_ELT(x, i, Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8)); } } + if (was_altrep) { + UNPROTECT(1); + } return x; }); } diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index 36f1f229a6085..a6291ebda09cc 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -738,5 +738,22 @@ test_that("read_csv2_arrow correctly parses comma decimals", { tf <- tempfile() writeLines("x;y\n1,2;c", con = tf) expect_equal(read_csv2_arrow(tf), tibble(x = 1.2, y = "c")) +}) + +test_that("altrep columns can roundtrip to table", { + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + # read in, some columns will be altrep by default + new_df <- read_csv_arrow(tf) + expect_equal(tbl, as_tibble(arrow_table(new_df))) + + # but also if we materialize the vector + # this could also be accomplished with printing + new_df <- read_csv_arrow(tf) + test_arrow_altrep_force_materialize(new_df$chr) + # we should still be able to turn this into a table + expect_equal(tbl, as_tibble(arrow_table(new_df))) }) From 373ce81bd40ee9bfd07b8cbec9821526096296bc Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Mon, 29 Jul 2024 10:31:11 +0530 Subject: [PATCH 29/73] MINOR: [Java][CI] Update the order of commands in Java-Jars CI (#43462) ### Rationale for this change `googletest` is installed by the `Brewfile` associated with cpp not Java. So move the order of uninstalling in that order. ### What changes are included in this PR? Change the order of commands to suit the installation objectives. ### Are these changes tested? 
Existing tests. ### Are there any user-facing changes? No Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: Sutou Kouhei --- dev/tasks/java-jars/github.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index ba988f893148f..3747788c091ba 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -138,14 +138,13 @@ jobs: # used on test We uninstall Homebrew's Protobuf to ensure using # bundled Protobuf. brew uninstall protobuf - - brew bundle --file=arrow/java/Brewfile - # We want to use the bundled googletest for static linking. Since # both BUNDLED and brew options are enabled, it could cause a conflict # when there is a version mismatch. # We uninstall googletest to ensure using the bundled googletest. brew uninstall googletest + + brew bundle --file=arrow/java/Brewfile - name: Build C++ libraries env: {{ macros.github_set_sccache_envvars()|indent(8) }} From 07a8d7db46043681244cfbed9446f16a32702331 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 29 Jul 2024 01:21:09 -0400 Subject: [PATCH 30/73] GH-43437: [Java] Update protobuf from 3.25.1 to 3.25.4 (#43436) ### Rationale for this change Update Java protobuf minor version. 3.25.4 includes the `Automatic-Module-Name`. ### What changes are included in this PR? * Update java protobuf to 3.25.4 ### Are these changes tested? CI ### Are there any user-facing changes? No. * GitHub Issue: #43437 Authored-by: Dane Pitkin Signed-off-by: David Li --- java/flight/flight-core/src/main/java/module-info.java | 4 ++-- java/flight/flight-sql/src/main/java/module-info.java | 2 +- java/gandiva/src/main/java/module-info.java | 2 +- java/pom.xml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/flight/flight-core/src/main/java/module-info.java b/java/flight/flight-core/src/main/java/module-info.java index e668fe6149fb9..28dbb732c4713 100644 --- a/java/flight/flight-core/src/main/java/module-info.java +++ b/java/flight/flight-core/src/main/java/module-info.java @@ -26,6 +26,8 @@ requires com.fasterxml.jackson.databind; requires com.google.common; requires com.google.errorprone.annotations; + requires com.google.protobuf; + requires com.google.protobuf.util; requires io.grpc; requires io.grpc.internal; requires io.grpc.netty; @@ -38,7 +40,5 @@ requires org.apache.arrow.format; requires org.apache.arrow.memory.core; requires org.apache.arrow.vector; - requires protobuf.java; - requires protobuf.java.util; requires org.slf4j; } diff --git a/java/flight/flight-sql/src/main/java/module-info.java b/java/flight/flight-sql/src/main/java/module-info.java index cb3835117daf6..42be9ce6d92db 100644 --- a/java/flight/flight-sql/src/main/java/module-info.java +++ b/java/flight/flight-sql/src/main/java/module-info.java @@ -21,10 +21,10 @@ exports org.apache.arrow.flight.sql.util; requires com.google.common; + requires com.google.protobuf; requires java.sql; requires org.apache.arrow.flight.core; requires org.apache.arrow.memory.core; requires org.apache.arrow.vector; requires org.apache.commons.cli; - requires protobuf.java; } diff --git a/java/gandiva/src/main/java/module-info.java b/java/gandiva/src/main/java/module-info.java index 5bce445707a3b..49deed1857691 100644 --- a/java/gandiva/src/main/java/module-info.java +++ b/java/gandiva/src/main/java/module-info.java @@ -21,9 +21,9 @@ exports org.apache.arrow.gandiva.ipc; requires com.google.common; + requires com.google.protobuf; requires org.apache.arrow.format; requires 
org.apache.arrow.memory.core; requires org.apache.arrow.vector; requires org.slf4j; - requires protobuf.java; } diff --git a/java/pom.xml b/java/pom.xml index 997257c71b6e9..1ed263d7db878 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -98,7 +98,7 @@ under the License. 33.2.1-jre 4.1.112.Final 1.65.0 - 3.25.1 + 3.25.4 2.17.2 3.4.0 24.3.25 From c980ebfeac2aa53fe95741c61184212b0636cfc8 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 29 Jul 2024 01:29:21 -0400 Subject: [PATCH 31/73] GH-43432: [Java][Packaging] Clean up java-jars job (#43431) ### Rationale for this change 1) Remove maven module references 2) Fix warning in java-jars job: ``` Warning: Some problems were encountered while building the effective model for org.apache.arrow:arrow-bom:pom:18.0.0-SNAPSHOT Warning: 'parent.relativePath' of POM org.apache.arrow:arrow-bom:18.0.0-SNAPSHOT (/Users/runner/work/crossbow/crossbow/arrow/java/bom/pom.xml) points at org.apache.arrow:arrow-java-root instead of org.apache:apache, please verify your project structure @ line 23, column 11 Warning: Warning: It is highly recommended to fix these problems because they threaten the stability of your build. Warning: Warning: For this reason, future Maven versions might no longer support building such malformed projects. ``` ### What changes are included in this PR? * Delete `mvn versions:set` for removed maven module * Add empty relativePath to Arrow BOM, so it doesn't use arrow-java-root ### Are these changes tested? java-jars CI job ### Are there any user-facing changes? No * GitHub Issue: #43432 Authored-by: Dane Pitkin Signed-off-by: David Li --- dev/tasks/java-jars/github.yml | 1 - java/bom/pom.xml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 3747788c091ba..58c1cedb11445 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -255,7 +255,6 @@ jobs: pushd arrow/java mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} -f bom - mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} -f maven popd arrow/ci/scripts/java_full_build.sh \ $GITHUB_WORKSPACE/arrow \ diff --git a/java/bom/pom.xml b/java/bom/pom.xml index fe3264102144b..1f8585c801330 100644 --- a/java/bom/pom.xml +++ b/java/bom/pom.xml @@ -24,6 +24,7 @@ under the License. org.apache apache 33 + org.apache.arrow From 7acb043ffc836c05a89a55c2836be2e5f94cb579 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 29 Jul 2024 01:48:49 -0400 Subject: [PATCH 32/73] GH-43425: [Java] Upgrade JNI to version 10 (#43424) ### Rationale for this change Java 11 is the supported min version, which means we can use JNI version 10. See Version info here: https://docs.oracle.com/en/java/javase/21/docs/specs/jni/functions.html#version-information ### What changes are included in this PR? * JNI Version upgraded to 10 from 1.6 and 1.8 ### Are these changes tested? CI/CD ### Are there any user-facing changes? 
No * GitHub Issue: #43425 Authored-by: Dane Pitkin Signed-off-by: David Li --- docs/source/java/cdata.rst | 2 +- java/adapter/orc/src/main/cpp/jni_wrapper.cpp | 2 +- java/c/src/main/cpp/jni_wrapper.cc | 2 +- java/dataset/src/main/cpp/jni_util.cc | 2 +- java/dataset/src/main/cpp/jni_wrapper.cc | 2 +- java/gandiva/src/main/cpp/jni_common.cc | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/java/cdata.rst b/docs/source/java/cdata.rst index 0f30fe1031588..69f7ab0b0787c 100644 --- a/docs/source/java/cdata.rst +++ b/docs/source/java/cdata.rst @@ -366,7 +366,7 @@ This application uses JNI to call Java code, but transfers data (zero-copy) via JavaVMOption options[2]; options[0].optionString = "-Djava.class.path=cpptojava.jar"; options[1].optionString = "-DXcheck:jni:pedantic"; - vm_args.version = JNI_VERSION_1_8; + vm_args.version = JNI_VERSION_10; vm_args.nOptions = 2; vm_args.options = options; int status = JNI_CreateJavaVM(jvm, (void **) &env, &vm_args); diff --git a/java/adapter/orc/src/main/cpp/jni_wrapper.cpp b/java/adapter/orc/src/main/cpp/jni_wrapper.cpp index cc629c9c432b4..6acf1084c3337 100644 --- a/java/adapter/orc/src/main/cpp/jni_wrapper.cpp +++ b/java/adapter/orc/src/main/cpp/jni_wrapper.cpp @@ -49,7 +49,7 @@ static jmethodID orc_memory_constructor; static jclass record_batch_class; static jmethodID record_batch_constructor; -static jint JNI_VERSION = JNI_VERSION_1_6; +static jint JNI_VERSION = JNI_VERSION_10; using arrow::internal::checked_cast; using arrow::jni::ConcurrentMap; diff --git a/java/c/src/main/cpp/jni_wrapper.cc b/java/c/src/main/cpp/jni_wrapper.cc index fea53aff49f40..35c2b7787e779 100644 --- a/java/c/src/main/cpp/jni_wrapper.cc +++ b/java/c/src/main/cpp/jni_wrapper.cc @@ -43,7 +43,7 @@ jmethodID kPrivateDataGetNextMethod; jmethodID kPrivateDataGetSchemaMethod; jmethodID kCDataExceptionConstructor; -jint JNI_VERSION = JNI_VERSION_1_6; +jint JNI_VERSION = JNI_VERSION_10; class JniPendingException : public std::runtime_error { public: diff --git a/java/dataset/src/main/cpp/jni_util.cc b/java/dataset/src/main/cpp/jni_util.cc index 8e899527f6a99..1fd15696e6e5f 100644 --- a/java/dataset/src/main/cpp/jni_util.cc +++ b/java/dataset/src/main/cpp/jni_util.cc @@ -28,7 +28,7 @@ namespace arrow { namespace dataset { namespace jni { -jint JNI_VERSION = JNI_VERSION_1_6; +jint JNI_VERSION = JNI_VERSION_10; class ReservationListenableMemoryPool::Impl { public: diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 79efbeb74fc54..4ef2a2ffd9206 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -51,7 +51,7 @@ jmethodID unreserve_memory_method; jlong default_memory_pool_id = -1L; -jint JNI_VERSION = JNI_VERSION_1_6; +jint JNI_VERSION = JNI_VERSION_10; class JniPendingException : public std::runtime_error { public: diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index a5dff9981ce89..ec1bb7623413a 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -67,7 +67,7 @@ using gandiva::ProjectorHolder; // forward declarations NodePtr ProtoTypeToNode(const gandiva::types::TreeNode& node); -static jint JNI_VERSION = JNI_VERSION_1_6; +static jint JNI_VERSION = JNI_VERSION_10; // extern refs - initialized for other modules. 
jclass configuration_builder_class_; From a61205fd9ee26ccdc9502058da6cc2edb07e61ac Mon Sep 17 00:00:00 2001 From: ziglerari <106024097+ziglerari@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:35:59 +0300 Subject: [PATCH 33/73] GH-43447: [C++] Filter out zero length buffers on gRPC transport (#43448) #43447 * GitHub Issue: #43447 Authored-by: Ari Zigler Signed-off-by: David Li --- cpp/src/arrow/flight/transport/grpc/serialization_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc index 372dca7a2c4c8..3df13532b0b05 100644 --- a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc +++ b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc @@ -284,7 +284,7 @@ ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out, for (const auto& buffer : ipc_msg.body_buffers) { // Buffer may be null when the row length is zero, or when all // entries are invalid. - if (!buffer) continue; + if (!buffer || buffer->size() == 0) continue; ::grpc::Slice slice; auto status = SliceFromBuffer(buffer).Value(&slice); From 7f31b362c94a1ca248a7d118c4cd3034ff214ff6 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 29 Jul 2024 04:09:04 -0500 Subject: [PATCH 34/73] GH-43450: [CI] Temporarily turn off conda jobs that are failing (#43451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's some work ongoing to update the conda jobs #42114, but many of them have been failing for many days (some have no history of success in our [crossbow report](http://crossbow.voltrondata.com)). Let's turn those off temporarily so that we stop ignoring other failures alongside it, and we can re-enable them once we get them back running. Alternatively, we could merge those fixes and close this PR. * GitHub Issue: #43450 Authored-by: Jonathan Keane Signed-off-by: Raúl Cumplido --- dev/tasks/tasks.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 5c8a7c4990d7a..94eac92a5be0b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -170,6 +170,16 @@ groups: - ubuntu-* - centos-* - conda-* + # Can be removed after conda recipes are synced: #42114 + - ~conda-linux-aarch64-cuda-py3 + - ~conda-linux-x64-cpu-py3 + - ~conda-win-x64-cpu-py3 + - ~conda-win-x64-cuda-py3 + - ~conda-linux-ppc64le-cuda-py3 + - ~conda-linux-aarch64-cpu-py3 + - ~conda-linux-ppc64le-cpu-py3 + - ~conda-linux-x64-cuda-py3 + - ~conda-osx-arm64-cpu-py3 - conan-* - java-jars - homebrew-cpp From 546452564fedfeefea97bb385708770ef37ee6ba Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 29 Jul 2024 07:12:22 -0500 Subject: [PATCH 35/73] MINOR: [CI] Add jonkeane to codeowners (#43452) Adding myself as codeowner to a few places that I'm more-or-less active in.
Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 03252657feaf1..793dbb3806f80 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -40,7 +40,7 @@ /matlab/ @kevingurney @kou @sgilmore10 /python/pyarrow/_flight.pyx @lidavidm /python/pyarrow/**/*gandiva* @wjones127 -/r/ @thisisnic +/r/ @jonkeane @thisisnic /ruby/ @kou /swift/ @kou @@ -53,7 +53,7 @@ # *.txt # PR CI and repository files -/.github/ @assignUser @kou @raulcd +/.github/ @assignUser @jonkeane @kou @raulcd .asf.yaml @assignUser @kou @raulcd .pre-commit-config.yaml @raulcd .travis.yml @assignUser @kou @raulcd @@ -61,11 +61,11 @@ appveyor.yml @assignUser @kou @raulcd # .git* # release scripts, archery etc. -/ci/ @assignUser @kou @raulcd -/dev/ @assignUser @kou @raulcd +/ci/ @assignUser @jonkeane @kou @raulcd +/dev/ @assignUser @jonkeane @kou @raulcd .dockerignore @raulcd -.env @assignUser @kou @raulcd -docker-compose.yml @assignUser @kou @raulcd +.env @assignUser @jonkeane @kou @raulcd +docker-compose.yml @assignUser @jonkeane @kou @raulcd # R specific packaging tooling /r/configure* @assignUser From f1ad618ec60f60cfc345417a4d8ffa6efa80a115 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:31:49 -0700 Subject: [PATCH 36/73] MINOR: [C#] Bump Grpc.Net.Client and System.Runtime.CompilerServices.Unsafe in /csharp (#43475) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Grpc.Net.Client](https://github.com/grpc/grpc-dotnet) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `Grpc.Net.Client` from 2.64.0 to 2.65.0
Release notes (sourced from Grpc.Net.Client's releases): Release v2.65.0-pre1. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.64.0...v2.65.0-pre1. What's Changed, New Contributors, and commit links truncated.

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes (sourced from System.Runtime.CompilerServices.Unsafe's releases): release links for .NET 6.0 (GA, RC 1 and 2, Previews 1 through 7) and for .NET 5.0.17, 5.0.16, 5.0.15, 5.0.14, 5.0.13 and 5.0.11; the notes state ".NET 5 is now out of support. We recommend using .NET 6." Remaining entries and commit links truncated.

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 5334f877873e4..c2081cbe1aa68 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -6,7 +6,7 @@ - + From 35d99e9871c302d4c1bdd408577879e7aa7c36f5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:33:13 -0700 Subject: [PATCH 37/73] MINOR: [C#] Bump Grpc.AspNetCore, Grpc.AspNetCore.Server, System.Runtime.CompilerServices.Unsafe and Grpc.Net.Client in /csharp (#43474) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [//]: # (dependabot-start) ⚠️ **Dependabot is rebasing this PR** ⚠️ Rebasing might not happen immediately, so don't worry if this takes some time. Note: if you make any changes to this PR yourself, they will take precedence over the rebase. --- [//]: # (dependabot-end) Bumps [Grpc.AspNetCore](https://github.com/grpc/grpc-dotnet), [Grpc.AspNetCore.Server](https://github.com/grpc/grpc-dotnet), [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime) and [Grpc.Net.Client](https://github.com/grpc/grpc-dotnet). These dependencies needed to be updated together. Updates `Grpc.AspNetCore` from 2.63.0 to 2.65.0
Release notes (sourced from Grpc.AspNetCore's releases): Release v2.65.0-pre1 (Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.64.0...v2.65.0-pre1); Release v2.64.0 (Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.63.0...v2.64.0); Release v2.64.0-pre1 (details truncated).

Updates `Grpc.AspNetCore.Server` from 2.64.0 to 2.65.0
Release notes (sourced from Grpc.AspNetCore.Server's releases): Release v2.65.0-pre1. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.64.0...v2.65.0-pre1. What's Changed, New Contributors, and commit links truncated.

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes (sourced from System.Runtime.CompilerServices.Unsafe's releases): release links for .NET 6.0 (GA, RC 1 and 2, Previews 1 through 7) and for .NET 5.0.17, 5.0.16, 5.0.15, 5.0.14, 5.0.13 and 5.0.11; the notes state ".NET 5 is now out of support. We recommend using .NET 6." Remaining entries and commit links truncated.

Updates `Grpc.Net.Client` from 2.64.0 to 2.65.0
Release notes (sourced from Grpc.Net.Client's releases): Release v2.65.0-pre1. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.64.0...v2.65.0-pre1. What's Changed, New Contributors, and commit links truncated.

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Flight.TestWeb.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj index 789fb9569edba..e6c7e174fa32f 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj @@ -5,7 +5,7 @@ - + From 95c6d77e8be8a72490a653b5ba62d7b71096f4fa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 18:15:30 -0700 Subject: [PATCH 38/73] MINOR: [C#] Bump Grpc.AspNetCore, Grpc.AspNetCore.Server and System.Runtime.CompilerServices.Unsafe in /csharp (#43478) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Grpc.AspNetCore](https://github.com/grpc/grpc-dotnet), [Grpc.AspNetCore.Server](https://github.com/grpc/grpc-dotnet) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `Grpc.AspNetCore` from 2.63.0 to 2.65.0
Release notes (sourced from Grpc.AspNetCore's releases): Release v2.65.0-pre1 (Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.64.0...v2.65.0-pre1); Release v2.64.0 (Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.63.0...v2.64.0); Release v2.64.0-pre1 (details truncated).

Updates `Grpc.AspNetCore.Server` from 2.64.0 to 2.65.0
Release notes (sourced from Grpc.AspNetCore.Server's releases): Release v2.65.0-pre1. Full Changelog: https://github.com/grpc/grpc-dotnet/compare/v2.64.0...v2.65.0-pre1. What's Changed, New Contributors, and commit links truncated.

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes (sourced from System.Runtime.CompilerServices.Unsafe's releases): release links for .NET 6.0 (GA, RC 1 and 2, Previews 1 through 7) and for .NET 5.0.17, 5.0.16, 5.0.15, 5.0.14, 5.0.13 and 5.0.11; the notes state ".NET 5 is now out of support. We recommend using .NET 6." Remaining entries and commit links truncated.

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher From 96a6c45360fba5d952b6054c4dbc25d0615fb4e0 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 30 Jul 2024 10:51:51 +0900 Subject: [PATCH 39/73] GH-43449: [CI][Conan] Don't push used images (#43470) ### Rationale for this change Because they aren't managed by us. ### What changes are included in this PR? Don't push used images for Conan related jobs. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. Authored-by: Sutou Kouhei Signed-off-by: Jacob Wujciak-Jens --- dev/tasks/docker-tests/github.linux.yml | 1 + dev/tasks/tasks.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/dev/tasks/docker-tests/github.linux.yml b/dev/tasks/docker-tests/github.linux.yml index 697960360cfdc..28d3203c1ed48 100644 --- a/dev/tasks/docker-tests/github.linux.yml +++ b/dev/tasks/docker-tests/github.linux.yml @@ -71,6 +71,7 @@ jobs: {% if arrow.is_default_branch() %} {{ macros.github_login_dockerhub()|indent }} - name: Push Docker Image + if: {{ push|default("true") }} shell: bash run: archery docker push {{ image }} {% endif %} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 94eac92a5be0b..3b00bc0040bd1 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -207,6 +207,7 @@ tasks: template: docker-tests/github.linux.yml params: image: conan + push: false conan-maximum: ci: github @@ -224,6 +225,7 @@ tasks: -e ARROW_CONAN_WITH_SNAPPY=True -e ARROW_CONAN_WITH_ZSTD=True image: conan + push: false ########################### Python Minimal ############################ From a4a5562e6b5a82ff297faeac45eb2ce1f391bfec Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 30 Jul 2024 11:55:21 +0900 Subject: [PATCH 40/73] GH-43400: [C++] Ensure using bundled GoogleTest when we use bundled GoogleTest (#43465) ### Rationale for this change If we use bundled GoogleTest and system other dependencies such as Boost, our include path options may be: * `-isystem /opt/homebrew/include` (for Boost) * `-isystem build_dir/_deps/googletest-src/googletest` (for bundled GoogleTest) * `-isystem build_dir/_deps/googletest-src/googlemock` (for bundled GoogleTest) With this order, GoogleTest headers in `/opt/homebrew/include/` are used with bundled GoogleTest. It may cause link errors. ### What changes are included in this PR? This change introduces a new CMake target `arrow::GTest::gtest_headers` that has include paths for bundled GoogleTest. And it's always used as the first link library of all test program. With this change, our include path options are: * `-isystem build_dir/_deps/googletest-src/googletest` (for bundled GoogleTest) * `-isystem build_dir/_deps/googletest-src/googlemock` (for bundled GoogleTest) * `-isystem /opt/homebrew/include` (for Boost) With this order, we can always use our bundled GoogleTest. `arrow::GTest::gtest_headers` is defined only when we use bundled GoogleTest. So this doesn't change the system GoogleTest case. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* GitHub Issue: #43400 Authored-by: Sutou Kouhei Signed-off-by: Jacob Wujciak-Jens --- cpp/cmake_modules/BuildUtils.cmake | 5 +++++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 6 ++++++ dev/tasks/java-jars/github.yml | 5 ----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index e7523add27223..692efa78376f4 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -721,6 +721,11 @@ function(ADD_TEST_CASE REL_TEST_NAME) "${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib") endif() + # Ensure using bundled GoogleTest when we use bundled GoogleTest. + # ARROW_GTEST_GTEST_HEADERS is defined only when we use bundled + # GoogleTest. + target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_GTEST_GTEST_HEADERS}) + if(ARG_STATIC_LINK_LIBS) # Customize link libraries target_link_libraries(${TEST_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1c8c40d6f9c52..92bd80014e8ae 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2306,6 +2306,10 @@ function(build_gtest) install(DIRECTORY "${googletest_SOURCE_DIR}/googlemock/include/" "${googletest_SOURCE_DIR}/googletest/include/" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + add_library(arrow::GTest::gtest_headers INTERFACE IMPORTED) + target_include_directories(arrow::GTest::gtest_headers + INTERFACE "${googletest_SOURCE_DIR}/googlemock/include/" + "${googletest_SOURCE_DIR}/googletest/include/") install(TARGETS gmock gmock_main gtest gtest_main EXPORT arrow_testing_targets RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" @@ -2350,12 +2354,14 @@ if(ARROW_TESTING) string(APPEND ARROW_TESTING_PC_LIBS " $") endif() + set(ARROW_GTEST_GTEST_HEADERS) set(ARROW_GTEST_GMOCK GTest::gmock) set(ARROW_GTEST_GTEST GTest::gtest) set(ARROW_GTEST_GTEST_MAIN GTest::gtest_main) else() string(APPEND ARROW_TESTING_PC_CFLAGS " -I\${includedir}/arrow-gtest") string(APPEND ARROW_TESTING_PC_LIBS " -larrow_gtest") + set(ARROW_GTEST_GTEST_HEADERS arrow::GTest::gtest_headers) set(ARROW_GTEST_GMOCK arrow::GTest::gmock) set(ARROW_GTEST_GTEST arrow::GTest::gtest) set(ARROW_GTEST_GTEST_MAIN arrow::GTest::gtest_main) diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 58c1cedb11445..77e8867652e65 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -138,11 +138,6 @@ jobs: # used on test We uninstall Homebrew's Protobuf to ensure using # bundled Protobuf. brew uninstall protobuf - # We want to use the bundled googletest for static linking. Since - # both BUNDLED and brew options are enabled, it could cause a conflict - # when there is a version mismatch. - # We uninstall googletest to ensure using the bundled googletest. - brew uninstall googletest brew bundle --file=arrow/java/Brewfile - name: Build C++ libraries From fd69e5ef9f2d60e3b7bc7fc80208e04994834b9e Mon Sep 17 00:00:00 2001 From: Jin Chengcheng Date: Tue, 30 Jul 2024 13:58:41 +0800 Subject: [PATCH 41/73] GH-28866: [Java] Java Dataset API ScanOptions expansion (#41646) ### Rationale for this change ### What changes are included in this PR? Support to add ArrowSchema to specify C++ CsvFragmentScanOptions.convert_options.column_types And use Map to set the config, serialize in java and deserialize in C++ for CsvFragmentScanOptions ### Are these changes tested? new added UT. 
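As a usage sketch of the options plumbing described above, mirroring the unit test added in this patch (the file URI and class name are placeholders; the API names are taken from the diff below):

```
import com.google.common.collect.ImmutableMap;
import java.util.Optional;
import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.dataset.scanner.ScanOptions;
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.scanner.csv.CsvConvertOptions;
import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ipc.ArrowReader;

public class CsvScanOptionsSketch {
  public static void main(String[] args) throws Exception {
    String uri = "file:///tmp/student.csv"; // placeholder path
    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE)) {
      // Options are plain string key/value pairs; CsvFragmentScanOptions#serialize
      // flattens them to a String[] (via MapUtil.convertMapToStringArray) and the
      // JNI side rebuilds a C++ CsvFragmentScanOptions from that map.
      CsvConvertOptions convertOptions =
          new CsvConvertOptions(ImmutableMap.of("delimiter", ";"));
      CsvFragmentScanOptions csvOptions =
          new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of());
      ScanOptions options =
          new ScanOptions.Builder(/*batchSize*/ 32768)
              .columns(Optional.empty())
              .fragmentScanOptions(csvOptions)
              .build();
      try (DatasetFactory factory =
              new FileSystemDatasetFactory(
                  allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, uri);
          Dataset dataset = factory.finish();
          Scanner scanner = dataset.newScan(options);
          ArrowReader reader = scanner.scanBatches()) {
        while (reader.loadNextBatch()) {
          System.out.println(reader.getVectorSchemaRoot().getRowCount());
        }
      }
    }
  }
}
```

Keeping the options as plain string key/value pairs keeps the JNI surface small: supporting a new CSV option only requires teaching ToCsvFragmentScanOptions in jni_wrapper.cc a new key.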
### Are there any user-facing changes? No. * GitHub Issue: #28866 Authored-by: Chengcheng Jin Signed-off-by: David Li --- java/dataset/src/main/cpp/jni_wrapper.cc | 117 ++++++++++-- .../file/FileSystemDatasetFactory.java | 40 ++++- .../apache/arrow/dataset/file/JniWrapper.java | 12 +- .../apache/arrow/dataset/jni/JniWrapper.java | 4 + .../arrow/dataset/jni/NativeDataset.java | 11 +- .../dataset/scanner/FragmentScanOptions.java | 26 +++ .../arrow/dataset/scanner/ScanOptions.java | 21 +++ .../scanner/csv/CsvConvertOptions.java | 48 +++++ .../scanner/csv/CsvFragmentScanOptions.java | 93 ++++++++++ .../apache/arrow/dataset/utils/MapUtil.java | 43 +++++ .../dataset/TestFragmentScanOptions.java | 168 ++++++++++++++++++ .../src/test/resources/data/student.csv | 4 + 12 files changed, 566 insertions(+), 21 deletions(-) create mode 100644 java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java create mode 100644 java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java create mode 100644 java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java create mode 100644 java/dataset/src/main/java/org/apache/arrow/dataset/utils/MapUtil.java create mode 100644 java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java create mode 100644 java/dataset/src/test/resources/data/student.csv diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 4ef2a2ffd9206..f324f87d6c301 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -25,6 +25,9 @@ #include "arrow/c/helpers.h" #include "arrow/dataset/api.h" #include "arrow/dataset/file_base.h" +#ifdef ARROW_CSV +#include "arrow/dataset/file_csv.h" +#endif #include "arrow/filesystem/api.h" #include "arrow/filesystem/path_util.h" #include "arrow/engine/substrait/util.h" @@ -363,6 +366,63 @@ std::shared_ptr LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec return buffer; } +inline bool ParseBool(const std::string& value) { return value == "true" ? 
true : false; } + +/// \brief Construct FragmentScanOptions from config map +#ifdef ARROW_CSV +arrow::Result> +ToCsvFragmentScanOptions(const std::unordered_map& configs) { + std::shared_ptr options = + std::make_shared(); + for (auto const& [key, value] : configs) { + if (key == "delimiter") { + options->parse_options.delimiter = value.data()[0]; + } else if (key == "quoting") { + options->parse_options.quoting = ParseBool(value); + } else if (key == "column_types") { + int64_t schema_address = std::stol(value); + ArrowSchema* c_schema = reinterpret_cast(schema_address); + ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(c_schema)); + auto& column_types = options->convert_options.column_types; + for (auto field : schema->fields()) { + column_types[field->name()] = field->type(); + } + } else if (key == "strings_can_be_null") { + options->convert_options.strings_can_be_null = ParseBool(value); + } else { + return arrow::Status::Invalid("Config " + key + " is not supported."); + } + } + return options; +} +#endif + +arrow::Result> +GetFragmentScanOptions(jint file_format_id, + const std::unordered_map& configs) { + switch (file_format_id) { +#ifdef ARROW_CSV + case 3: + return ToCsvFragmentScanOptions(configs); +#endif + default: + return arrow::Status::Invalid("Illegal file format id: ", file_format_id); + } +} + +std::unordered_map ToStringMap(JNIEnv* env, + jobjectArray& str_array) { + int length = env->GetArrayLength(str_array); + std::unordered_map map; + map.reserve(length / 2); + for (int i = 0; i < length; i += 2) { + auto key = reinterpret_cast(env->GetObjectArrayElement(str_array, i)); + auto value = reinterpret_cast(env->GetObjectArrayElement(str_array, i + 1)); + map[JStringToCString(env, key)] = JStringToCString(env, value); + } + return map; +} + /* * Class: org_apache_arrow_dataset_jni_NativeMemoryPool * Method: getDefaultMemoryPool @@ -501,12 +561,13 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_closeDataset /* * Class: org_apache_arrow_dataset_jni_JniWrapper * Method: createScanner - * Signature: (J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JJ)J + * Signature: + * (J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JI;[Ljava/lang/String;J)J */ JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_createScanner( JNIEnv* env, jobject, jlong dataset_id, jobjectArray columns, - jobject substrait_projection, jobject substrait_filter, - jlong batch_size, jlong memory_pool_id) { + jobject substrait_projection, jobject substrait_filter, jlong batch_size, + jint file_format_id, jobjectArray options, jlong memory_pool_id) { JNI_METHOD_START arrow::MemoryPool* pool = reinterpret_cast(memory_pool_id); if (pool == nullptr) { @@ -555,6 +616,12 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_createScann } JniAssertOkOrThrow(scanner_builder->Filter(*filter_expr)); } + if (file_format_id != -1 && options != nullptr) { + std::unordered_map option_map = ToStringMap(env, options); + std::shared_ptr scan_options = + JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); + JniAssertOkOrThrow(scanner_builder->FragmentScanOptions(scan_options)); + } JniAssertOkOrThrow(scanner_builder->BatchSize(batch_size)); auto scanner = JniGetOrThrow(scanner_builder->Finish()); @@ -668,14 +735,29 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_ensureS3Fina /* * Class: org_apache_arrow_dataset_file_JniWrapper * Method: makeFileSystemDatasetFactory - * Signature: 
(Ljava/lang/String;II)J + * Signature: (Ljava/lang/String;II;Ljava/lang/String;Ljava/lang/String)J */ JNIEXPORT jlong JNICALL -Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljava_lang_String_2I( - JNIEnv* env, jobject, jstring uri, jint file_format_id) { +Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory( + JNIEnv* env, jobject, jstring uri, jint file_format_id, jobjectArray options) { JNI_METHOD_START std::shared_ptr file_format = JniGetOrThrow(GetFileFormat(file_format_id)); + if (options != nullptr) { + std::unordered_map option_map = ToStringMap(env, options); + std::shared_ptr scan_options = + JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); + file_format->default_fragment_scan_options = scan_options; +#ifdef ARROW_CSV + if (file_format_id == 3) { + std::shared_ptr csv_file_format = + std::dynamic_pointer_cast(file_format); + csv_file_format->parse_options = + std::dynamic_pointer_cast(scan_options) + ->parse_options; + } +#endif + } arrow::dataset::FileSystemFactoryOptions options; std::shared_ptr d = JniGetOrThrow(arrow::dataset::FileSystemDatasetFactory::Make( @@ -686,16 +768,31 @@ Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljav /* * Class: org_apache_arrow_dataset_file_JniWrapper - * Method: makeFileSystemDatasetFactory - * Signature: ([Ljava/lang/String;II)J + * Method: makeFileSystemDatasetFactoryWithFiles + * Signature: ([Ljava/lang/String;II;[Ljava/lang/String)J */ JNIEXPORT jlong JNICALL -Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory___3Ljava_lang_String_2I( - JNIEnv* env, jobject, jobjectArray uris, jint file_format_id) { +Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactoryWithFiles( + JNIEnv* env, jobject, jobjectArray uris, jint file_format_id, jobjectArray options) { JNI_METHOD_START std::shared_ptr file_format = JniGetOrThrow(GetFileFormat(file_format_id)); + if (options != nullptr) { + std::unordered_map option_map = ToStringMap(env, options); + std::shared_ptr scan_options = + JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); + file_format->default_fragment_scan_options = scan_options; +#ifdef ARROW_CSV + if (file_format_id == 3) { + std::shared_ptr csv_file_format = + std::dynamic_pointer_cast(file_format); + csv_file_format->parse_options = + std::dynamic_pointer_cast(scan_options) + ->parse_options; + } +#endif + } arrow::dataset::FileSystemFactoryOptions options; std::vector uri_vec = ToStringVector(env, uris); diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java index 36ac6288af6d0..fcf124a61f812 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileSystemDatasetFactory.java @@ -16,8 +16,10 @@ */ package org.apache.arrow.dataset.file; +import java.util.Optional; import org.apache.arrow.dataset.jni.NativeDatasetFactory; import org.apache.arrow.dataset.jni.NativeMemoryPool; +import org.apache.arrow.dataset.scanner.FragmentScanOptions; import org.apache.arrow.memory.BufferAllocator; /** Java binding of the C++ FileSystemDatasetFactory. 
*/ @@ -25,19 +27,45 @@ public class FileSystemDatasetFactory extends NativeDatasetFactory { public FileSystemDatasetFactory( BufferAllocator allocator, NativeMemoryPool memoryPool, FileFormat format, String uri) { - super(allocator, memoryPool, createNative(format, uri)); + super(allocator, memoryPool, createNative(format, uri, Optional.empty())); + } + + public FileSystemDatasetFactory( + BufferAllocator allocator, + NativeMemoryPool memoryPool, + FileFormat format, + String uri, + Optional fragmentScanOptions) { + super(allocator, memoryPool, createNative(format, uri, fragmentScanOptions)); } public FileSystemDatasetFactory( BufferAllocator allocator, NativeMemoryPool memoryPool, FileFormat format, String[] uris) { - super(allocator, memoryPool, createNative(format, uris)); + super(allocator, memoryPool, createNative(format, uris, Optional.empty())); + } + + public FileSystemDatasetFactory( + BufferAllocator allocator, + NativeMemoryPool memoryPool, + FileFormat format, + String[] uris, + Optional fragmentScanOptions) { + super(allocator, memoryPool, createNative(format, uris, fragmentScanOptions)); } - private static long createNative(FileFormat format, String uri) { - return JniWrapper.get().makeFileSystemDatasetFactory(uri, format.id()); + private static long createNative( + FileFormat format, String uri, Optional fragmentScanOptions) { + return JniWrapper.get() + .makeFileSystemDatasetFactory( + uri, format.id(), fragmentScanOptions.map(FragmentScanOptions::serialize).orElse(null)); } - private static long createNative(FileFormat format, String[] uris) { - return JniWrapper.get().makeFileSystemDatasetFactory(uris, format.id()); + private static long createNative( + FileFormat format, String[] uris, Optional fragmentScanOptions) { + return JniWrapper.get() + .makeFileSystemDatasetFactoryWithFiles( + uris, + format.id(), + fragmentScanOptions.map(FragmentScanOptions::serialize).orElse(null)); } } diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java index dfac293ccb588..d2f842f99e588 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java @@ -37,22 +37,26 @@ private JniWrapper() {} * intermediate shared_ptr of the factory instance. * * @param uri file uri to read, either a file or a directory - * @param fileFormat file format ID + * @param fileFormat file format ID. + * @param serializedFragmentScanOptions serialized FragmentScanOptions. * @return the native pointer of the arrow::dataset::FileSystemDatasetFactory instance. * @see FileFormat */ - public native long makeFileSystemDatasetFactory(String uri, int fileFormat); + public native long makeFileSystemDatasetFactory( + String uri, int fileFormat, String[] serializedFragmentScanOptions); /** * Create FileSystemDatasetFactory and return its native pointer. The pointer is pointing to a * intermediate shared_ptr of the factory instance. * * @param uris List of file uris to read, each path pointing to an individual file - * @param fileFormat file format ID + * @param fileFormat file format ID. + * @param serializedFragmentScanOptions serialized FragmentScanOptions. * @return the native pointer of the arrow::dataset::FileSystemDatasetFactory instance. 
* @see FileFormat */ - public native long makeFileSystemDatasetFactory(String[] uris, int fileFormat); + public native long makeFileSystemDatasetFactoryWithFiles( + String[] uris, int fileFormat, String[] serializedFragmentScanOptions); /** * Write the content in a {@link org.apache.arrow.c.ArrowArrayStream} into files. This internally diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java index b5aa3d918acd9..6637c113d9edc 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java @@ -71,6 +71,8 @@ private JniWrapper() {} * @param substraitProjection substrait extended expression to evaluate for project new columns * @param substraitFilter substrait extended expression to evaluate for apply filter * @param batchSize batch size of scanned record batches. + * @param fileFormat file format ID. + * @param serializedFragmentScanOptions serialized FragmentScanOptions. * @param memoryPool identifier of memory pool used in the native scanner. * @return the native pointer of the arrow::dataset::Scanner instance. */ @@ -80,6 +82,8 @@ public native long createScanner( ByteBuffer substraitProjection, ByteBuffer substraitFilter, long batchSize, + int fileFormat, + String[] serializedFragmentScanOptions, long memoryPool); /** diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java index 83a9ff1f32243..8f8cdc49d4877 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/NativeDataset.java @@ -16,6 +16,7 @@ */ package org.apache.arrow.dataset.jni; +import org.apache.arrow.dataset.scanner.FragmentScanOptions; import org.apache.arrow.dataset.scanner.ScanOptions; import org.apache.arrow.dataset.source.Dataset; @@ -37,7 +38,13 @@ public synchronized NativeScanner newScan(ScanOptions options) { if (closed) { throw new NativeInstanceReleasedException(); } - + int fileFormatId = -1; + String[] serialized = null; + if (options.getFragmentScanOptions().isPresent()) { + FragmentScanOptions fragmentScanOptions = options.getFragmentScanOptions().get(); + fileFormatId = fragmentScanOptions.fileFormat().id(); + serialized = fragmentScanOptions.serialize(); + } long scannerId = JniWrapper.get() .createScanner( @@ -46,6 +53,8 @@ public synchronized NativeScanner newScan(ScanOptions options) { options.getSubstraitProjection().orElse(null), options.getSubstraitFilter().orElse(null), options.getBatchSize(), + fileFormatId, + serialized, context.getMemoryPool().getNativeInstanceId()); return new NativeScanner(context, scannerId); diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java new file mode 100644 index 0000000000000..d48d0bd2b76b9 --- /dev/null +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.dataset.scanner; + +import org.apache.arrow.dataset.file.FileFormat; + +/** The file fragment scan options interface. It is used to transfer to JNI call. */ +public interface FragmentScanOptions { + FileFormat fileFormat(); + + String[] serialize(); +} diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java index 837016ad1e9d1..68fc3943b3edd 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanOptions.java @@ -27,6 +27,8 @@ public class ScanOptions { private final Optional substraitProjection; private final Optional substraitFilter; + private final Optional fragmentScanOptions; + /** * Constructor. * @@ -65,6 +67,7 @@ public ScanOptions(long batchSize, Optional columns) { this.columns = columns; this.substraitProjection = Optional.empty(); this.substraitFilter = Optional.empty(); + this.fragmentScanOptions = Optional.empty(); } public ScanOptions(long batchSize) { @@ -87,12 +90,17 @@ public Optional getSubstraitFilter() { return substraitFilter; } + public Optional getFragmentScanOptions() { + return fragmentScanOptions; + } + /** Builder for Options used during scanning. */ public static class Builder { private final long batchSize; private Optional columns; private ByteBuffer substraitProjection; private ByteBuffer substraitFilter; + private FragmentScanOptions fragmentScanOptions; /** * Constructor. @@ -140,6 +148,18 @@ public Builder substraitFilter(ByteBuffer substraitFilter) { return this; } + /** + * Set the FragmentScanOptions. + * + * @param fragmentScanOptions fragment scan options + * @return the ScanOptions configured. + */ + public Builder fragmentScanOptions(FragmentScanOptions fragmentScanOptions) { + Preconditions.checkNotNull(fragmentScanOptions); + this.fragmentScanOptions = fragmentScanOptions; + return this; + } + public ScanOptions build() { return new ScanOptions(this); } @@ -150,5 +170,6 @@ private ScanOptions(Builder builder) { columns = builder.columns; substraitProjection = Optional.ofNullable(builder.substraitProjection); substraitFilter = Optional.ofNullable(builder.substraitFilter); + fragmentScanOptions = Optional.ofNullable(builder.fragmentScanOptions); } } diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java new file mode 100644 index 0000000000000..15e257896b80e --- /dev/null +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvConvertOptions.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.dataset.scanner.csv; + +import java.util.Map; +import java.util.Optional; +import org.apache.arrow.c.ArrowSchema; + +public class CsvConvertOptions { + + private final Map configs; + + private Optional cSchema = Optional.empty(); + + public CsvConvertOptions(Map configs) { + this.configs = configs; + } + + public Optional getArrowSchema() { + return cSchema; + } + + public Map getConfigs() { + return configs; + } + + public void set(String key, String value) { + configs.put(key, value); + } + + public void setArrowSchema(ArrowSchema cSchema) { + this.cSchema = Optional.of(cSchema); + } +} diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java new file mode 100644 index 0000000000000..39271b5f063fb --- /dev/null +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.dataset.scanner.csv; + +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.arrow.dataset.file.FileFormat; +import org.apache.arrow.dataset.scanner.FragmentScanOptions; +import org.apache.arrow.dataset.utils.MapUtil; + +public class CsvFragmentScanOptions implements FragmentScanOptions { + private final CsvConvertOptions convertOptions; + private final Map readOptions; + private final Map parseOptions; + + /** + * CSV scan options, map to CPP struct CsvFragmentScanOptions. The key in config map is the field + * name of mapping cpp struct + * + * @param convertOptions similar to CsvFragmentScanOptions#convert_options in CPP, the ArrowSchema + * represents column_types, convert data option such as null value recognition. 
+ * @param readOptions similar to CsvFragmentScanOptions#read_options in CPP, specify how to read + * the file such as block_size + * @param parseOptions similar to CsvFragmentScanOptions#parse_options in CPP, parse file option + * such as delimiter + */ + public CsvFragmentScanOptions( + CsvConvertOptions convertOptions, + Map readOptions, + Map parseOptions) { + this.convertOptions = convertOptions; + this.readOptions = readOptions; + this.parseOptions = parseOptions; + } + + /** + * File format. + * + * @return file format. + */ + @Override + public FileFormat fileFormat() { + return FileFormat.CSV; + } + + /** + * This is an internal function to invoke by serializer. Serialize this class to string array and + * then called by JNI call. + * + * @return string array as Map JNI bridge format. + */ + @Override + public String[] serialize() { + Map options = + Stream.concat( + Stream.concat(readOptions.entrySet().stream(), parseOptions.entrySet().stream()), + convertOptions.getConfigs().entrySet().stream()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + if (convertOptions.getArrowSchema().isPresent()) { + options.put( + "column_types", Long.toString(convertOptions.getArrowSchema().get().memoryAddress())); + } + return MapUtil.convertMapToStringArray(options); + } + + public CsvConvertOptions getConvertOptions() { + return convertOptions; + } + + public Map getReadOptions() { + return readOptions; + } + + public Map getParseOptions() { + return parseOptions; + } +} diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/utils/MapUtil.java b/java/dataset/src/main/java/org/apache/arrow/dataset/utils/MapUtil.java new file mode 100644 index 0000000000000..4df6cf1e0e05e --- /dev/null +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/utils/MapUtil.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.dataset.utils; + +import java.util.Map; + +/** The utility class for Map. */ +public class MapUtil { + private MapUtil() {} + + /** + * Convert the map to string array as JNI bridge. 
+ * + * @param config config map + * @return string array for serialization + */ + public static String[] convertMapToStringArray(Map config) { + if (config.isEmpty()) { + return null; + } + String[] configs = new String[config.size() * 2]; + int i = 0; + for (Map.Entry entry : config.entrySet()) { + configs[i++] = entry.getKey(); + configs[i++] = entry.getValue(); + } + return configs; + } +} diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java new file mode 100644 index 0000000000000..9787e8308e73e --- /dev/null +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.dataset; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import com.google.common.collect.ImmutableMap; +import java.util.Arrays; +import java.util.Collections; +import java.util.Optional; +import org.apache.arrow.c.ArrowSchema; +import org.apache.arrow.c.CDataDictionaryProvider; +import org.apache.arrow.c.Data; +import org.apache.arrow.dataset.file.FileFormat; +import org.apache.arrow.dataset.file.FileSystemDatasetFactory; +import org.apache.arrow.dataset.jni.NativeMemoryPool; +import org.apache.arrow.dataset.scanner.ScanOptions; +import org.apache.arrow.dataset.scanner.Scanner; +import org.apache.arrow.dataset.scanner.csv.CsvConvertOptions; +import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions; +import org.apache.arrow.dataset.source.Dataset; +import org.apache.arrow.dataset.source.DatasetFactory; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.ValueIterableVector; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.hamcrest.collection.IsIterableContainingInOrder; +import org.junit.jupiter.api.Test; + +public class TestFragmentScanOptions { + + @Test + public void testCsvConvertOptions() throws Exception { + final Schema schema = + new Schema( + Arrays.asList( + Field.nullable("Id", new ArrowType.Int(32, true)), + Field.nullable("Name", new ArrowType.Utf8()), + Field.nullable("Language", new ArrowType.Utf8())), + null); + String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; + BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); + CDataDictionaryProvider provider = new CDataDictionaryProvider()) { + 
Data.exportSchema(allocator, schema, provider, cSchema); + CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of("delimiter", ";")); + convertOptions.setArrowSchema(cSchema); + CsvFragmentScanOptions fragmentScanOptions = + new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of()); + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .fragmentScanOptions(fragmentScanOptions) + .build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + Dataset dataset = datasetFactory.finish(); + Scanner scanner = dataset.newScan(options); + ArrowReader reader = scanner.scanBatches()) { + + assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); + int rowCount = 0; + while (reader.loadNextBatch()) { + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); + assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(1, 2, 3)); + rowCount += reader.getVectorSchemaRoot().getRowCount(); + } + assertEquals(3, rowCount); + } + } + } + + @Test + public void testCsvConvertOptionsDelimiterNotSet() throws Exception { + final Schema schema = + new Schema( + Arrays.asList( + Field.nullable("Id", new ArrowType.Int(32, true)), + Field.nullable("Name", new ArrowType.Utf8()), + Field.nullable("Language", new ArrowType.Utf8())), + null); + String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; + BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator); + CDataDictionaryProvider provider = new CDataDictionaryProvider()) { + Data.exportSchema(allocator, schema, provider, cSchema); + CsvConvertOptions convertOptions = new CsvConvertOptions(ImmutableMap.of()); + convertOptions.setArrowSchema(cSchema); + CsvFragmentScanOptions fragmentScanOptions = + new CsvFragmentScanOptions(convertOptions, ImmutableMap.of(), ImmutableMap.of()); + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .fragmentScanOptions(fragmentScanOptions) + .build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + Dataset dataset = datasetFactory.finish(); + Scanner scanner = dataset.newScan(options); + ArrowReader reader = scanner.scanBatches()) { + + assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); + int rowCount = 0; + while (reader.loadNextBatch()) { + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id"); + assertThat(idVector.getValueIterable(), IsIterableContainingInOrder.contains(1, 2, 3)); + rowCount += reader.getVectorSchemaRoot().getRowCount(); + } + assertEquals(3, rowCount); + } + } + } + + @Test + public void testCsvConvertOptionsNoOption() throws Exception { + final Schema schema = + new Schema( + Collections.singletonList(Field.nullable("Id;Name;Language", new ArrowType.Utf8())), + null); + String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; + BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768).columns(Optional.empty()).build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, 
NativeMemoryPool.getDefault(), FileFormat.CSV, path); + Dataset dataset = datasetFactory.finish(); + Scanner scanner = dataset.newScan(options); + ArrowReader reader = scanner.scanBatches()) { + + assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); + int rowCount = 0; + while (reader.loadNextBatch()) { + final ValueIterableVector idVector = + (ValueIterableVector) + reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + assertThat( + idVector.getValueIterable(), + IsIterableContainingInOrder.contains( + "1;Juno;Java\n" + "2;Peter;Python\n" + "3;Celin;C++")); + rowCount += reader.getVectorSchemaRoot().getRowCount(); + } + assertEquals(3, rowCount); + } + } +} diff --git a/java/dataset/src/test/resources/data/student.csv b/java/dataset/src/test/resources/data/student.csv new file mode 100644 index 0000000000000..3291946092156 --- /dev/null +++ b/java/dataset/src/test/resources/data/student.csv @@ -0,0 +1,4 @@ +Id;Name;Language +1;Juno;Java +2;Peter;Python +3;Celin;C++ From ff8a771c43be18e93f6321e5aa9b1acb68a4f6c4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:07:59 +0900 Subject: [PATCH 42/73] MINOR: [Java] Bump com.google.api.grpc:proto-google-common-protos from 2.40.0 to 2.42.0 in /java (#43473) Bumps [com.google.api.grpc:proto-google-common-protos](https://github.com/googleapis/sdk-platform-java) from 2.40.0 to 2.42.0.
Release notes

Sourced from com.google.api.grpc:proto-google-common-protos's releases.

v2.42.0

2.42.0 (2024-06-25)

Features

  • Allow Adding Client Level Attributes to MetricsTracerFactory (#2614) (f122c6f)
  • gapic-generator-java to perform a no-op when no services are detected (#2460) (c0b5646)
  • Make Layout Parser generally available in V1 (e508ae6)
  • populate .repo-metadata.json from highest version (#2890) (f587541)
  • push SNAPSHOT versions of the hermetic build docker image (#2888) (81df866)

Bug Fixes

  • deps: update the Java code generator (gapic-generator-java) to 1.2.3 (e508ae6)
  • Expose Gax meter name (#2865) (6c5d6ce)
  • Move the logic of getting systemProductName from static block to static method (#2874) (536f1eb)
  • Update default Otel Attribute from method_name to method (#2833) (af10a9e)

Dependencies

  • update dependency com.google.auto.value:auto-value to v1.11.0 (#2842) (dd27fdf)
  • update dependency com.google.auto.value:auto-value-annotations to v1.11.0 (#2843) (bf8e67f)
  • update dependency com.google.cloud:grpc-gcp to v1.6.1 (#2943) (9f16b40)
  • update dependency org.checkerframework:checker-qual to v3.44.0 (#2848) (7a99c50)
  • update dependency org.easymock:easymock to v5.3.0 (#2871) (c243f7d)
  • update google api dependencies (#2846) (b5ef698)
  • update googleapis/java-cloud-bom digest to 17cc5ec (#2882) (d6abd8e)
  • update netty dependencies to v4.1.111.final (#2877) (b5f10b9)
  • update opentelemetry-java monorepo to v1.39.0 (#2863) (9d1f3a8)

v2.41.0

2.41.0 (2024-05-31)

Features

Bug Fixes

Dependencies

... (truncated)

Changelog

Sourced from com.google.api.grpc:proto-google-common-protos's changelog.

2.41.0 (2024-05-31)

Features

Bug Fixes

Dependencies

  • update dependency com.google.api-client:google-api-client-bom to v2.6.0 (#2782) (5bc8928)
  • update dependency com.google.cloud.opentelemetry:detector-resources-support to v0.29.0 (#2831) (6c1dbfc)

... (truncated)

Commits

  • 4f5c4d9 chore(main): release 2.42.0 (#2847)
  • f15df3f build(deps): update dependency org.apache.maven.plugins:maven-surefire-plugin...
  • e4f1217 build(deps): update dependency org.apache.maven.plugins:maven-project-info-re...
  • d3960d9 build(deps): update dependency com.google.cloud:google-cloud-shared-config to...
  • 9f16b40 deps: update dependency com.google.cloud:grpc-gcp to v1.6.1 (#2943)
  • c42fce7 build(deps): update dependency org.apache.maven.plugins:maven-failsafe-plugin...
  • d6abd8e deps: update googleapis/java-cloud-bom digest to 17cc5ec (#2882)
  • c243f7d deps: update dependency org.easymock:easymock to v5.3.0 (#2871)
  • b5f10b9 deps: update netty dependencies to v4.1.111.final (#2877)
  • 9d1f3a8 deps: update opentelemetry-java monorepo to v1.39.0 (#2863)
  • Additional commits viewable in compare view
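For orientation only, and not part of this diff: proto-google-common-protos ships the prebuilt Java classes for Google's shared API protos, which flight-core pulls in at test scope. Below is a minimal, hypothetical sketch of using one of those bundled types, assuming the standard com.google.rpc.Status builder API; the class name and message text are illustrative only.

import com.google.rpc.Code;
import com.google.rpc.Status;

public class CommonProtosSketch {
  public static void main(String[] args) {
    // Build com.google.rpc.Status, one of the shared message types this artifact provides.
    Status status =
        Status.newBuilder()
            .setCode(Code.NOT_FOUND.getNumber()) // numeric canonical error code
            .setMessage("requested flight path was not found") // hypothetical message
            .build();
    System.out.println(status);
  }
}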
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/flight/flight-core/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index be3c191654a58..e4d1d5d3885a5 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -134,7 +134,7 @@ under the License. com.google.api.grpc proto-google-common-protos - 2.40.0 + 2.42.0 test From 49bad4e2d4307aa7617c640bfbf880488a84c9bd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:20:08 +0900 Subject: [PATCH 43/73] MINOR: [Java] Bump org.codehaus.mojo:versions-maven-plugin from 2.17.0 to 2.17.1 in /java (#43472) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.codehaus.mojo:versions-maven-plugin](https://github.com/mojohaus/versions) from 2.17.0 to 2.17.1.
Release notes

Sourced from org.codehaus.mojo:versions-maven-plugin's releases.

2.17.1

Changes

🐛 Bug Fixes

📦 Dependency updates

👻 Maintenance

Commits

  • 0b79444 [maven-release-plugin] prepare release 2.17.1
  • 2efe05f Extract Pattern.compile to static variable
  • 8321211 Revert #1051 - fix #1110, #1103
  • 9fb2951 Bump byteBuddyVersion from 1.14.17 to 1.14.18
  • 65309e8 Improve ITs for set goal
  • 2a696ac Remove default values from invoker.properties
  • 1383503 Bump org.codehaus.plexus:plexus-archiver from 4.9.2 to 4.10.0
  • 9e445db Remove default value for invoker.buildResult
  • 5ddc194 Improve ITs for set goal
  • 13242ee Fix includeParent field value not matching Parameter#defaultValue
  • Additional commits viewable in compare view
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/bom/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/bom/pom.xml b/java/bom/pom.xml index 1f8585c801330..ce05f25bc6c9e 100644 --- a/java/bom/pom.xml +++ b/java/bom/pom.xml @@ -208,7 +208,7 @@ under the License. org.codehaus.mojo versions-maven-plugin - 2.17.0 + 2.17.1 diff --git a/java/pom.xml b/java/pom.xml index 1ed263d7db878..c62ae332b1065 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -508,7 +508,7 @@ under the License. org.codehaus.mojo versions-maven-plugin - 2.17.0 + 2.17.1 pl.project13.maven From f00a3061fd5e54b05387cdbd7499ade5f0f011e0 Mon Sep 17 00:00:00 2001 From: Anja Kefala Date: Tue, 30 Jul 2024 02:23:43 -0700 Subject: [PATCH 44/73] GH-43391: [Python] Add bindings for memory manager and device to Context class (#43392) ### What changes are included in this PR? Added bindings for `device` and `memory_manager` to `pyarrow.cuda.Context class`. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes * GitHub Issue: #43391 Lead-authored-by: anjakefala Co-authored-by: Anja Kefala Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/_cuda.pyx | 22 ++++++++++++++++++++++ python/pyarrow/includes/libarrow_cuda.pxd | 2 ++ python/pyarrow/tests/test_cuda.py | 11 +++++++++++ 3 files changed, 35 insertions(+) diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx index ba799a105e7e1..5aed9f8a28518 100644 --- a/python/pyarrow/_cuda.pyx +++ b/python/pyarrow/_cuda.pyx @@ -185,6 +185,28 @@ cdef class Context(_Weakrefable): cudabuf = GetResultValue(self.context.get().Allocate(nbytes)) return pyarrow_wrap_cudabuffer(cudabuf) + @property + def memory_manager(self): + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ + return MemoryManager.wrap(self.context.get().memory_manager()) + + @property + def device(self): + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + return Device.wrap(self.context.get().device()) + def foreign_buffer(self, address, size, base=None): """ Create device buffer from address and size as a view. 
diff --git a/python/pyarrow/includes/libarrow_cuda.pxd b/python/pyarrow/includes/libarrow_cuda.pxd index 3ac943cf941d8..39ca53908a68b 100644 --- a/python/pyarrow/includes/libarrow_cuda.pxd +++ b/python/pyarrow/includes/libarrow_cuda.pxd @@ -41,6 +41,8 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::cuda" nogil: const void* handle() const int device_number() const CResult[uintptr_t] GetDeviceAddress(uintptr_t addr) + shared_ptr[CDevice] device() const + shared_ptr[CMemoryManager] memory_manager() const cdef cppclass CCudaIpcMemHandle" arrow::cuda::CudaIpcMemHandle": @staticmethod diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index 5e833d5371de2..61f784a729f73 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -57,6 +57,17 @@ def test_Context(): assert global_context.device_number == 0 assert global_context1.device_number == cuda.Context.get_num_devices() - 1 + mm = global_context.memory_manager + assert not mm.is_cpu + assert " Date: Tue, 30 Jul 2024 09:21:58 -0400 Subject: [PATCH 45/73] MINOR: [Java] Bump com.puppycrawl.tools:checkstyle from 8.29 to 10.17.0 in /java (#43311) ### Rationale for this change Now that Java 8 is deprecated, we can bump checkstyle to latest version. ### What changes are included in this PR? * Bump checkstyle to 10.17.0 * Update checkstyle.xml to fix backwards-breaking changes * Fix new check errors ### Are these changes tested? CI ### Are there any user-facing changes? No Authored-by: Dane Pitkin Signed-off-by: Dane Pitkin --- java/.gitattributes | 1 + java/dev/checkstyle/checkstyle.xml | 2 +- .../jdbc/utils/VectorSchemaRootTransformer.java | 11 +++++++++++ java/pom.xml | 3 ++- .../org/apache/arrow/vector/BaseFixedWidthVector.java | 2 +- .../arrow/vector/BaseLargeVariableWidthVector.java | 2 +- .../apache/arrow/vector/BaseVariableWidthVector.java | 2 +- .../arrow/vector/BaseVariableWidthViewVector.java | 2 +- .../org/apache/arrow/vector/ExtensionTypeVector.java | 2 +- .../java/org/apache/arrow/vector/FieldVector.java | 2 +- .../main/java/org/apache/arrow/vector/NullVector.java | 2 +- .../arrow/vector/complex/BaseRepeatedValueVector.java | 2 +- .../apache/arrow/vector/complex/LargeListVector.java | 4 ++-- .../org/apache/arrow/vector/complex/ListVector.java | 2 +- .../arrow/vector/complex/RepeatedValueVector.java | 2 +- .../org/apache/arrow/vector/complex/StructVector.java | 2 +- 16 files changed, 28 insertions(+), 15 deletions(-) diff --git a/java/.gitattributes b/java/.gitattributes index 596615322fb3e..366d3c2b3cdf6 100644 --- a/java/.gitattributes +++ b/java/.gitattributes @@ -1,2 +1,3 @@ .gitattributes export-ignore .gitignore export-ignore +* text=auto eol=lf diff --git a/java/dev/checkstyle/checkstyle.xml b/java/dev/checkstyle/checkstyle.xml index eb63c3ff0fc61..4b546ac56ea23 100644 --- a/java/dev/checkstyle/checkstyle.xml +++ b/java/dev/checkstyle/checkstyle.xml @@ -180,7 +180,7 @@ - + diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java index b651e1eb9bcae..b3c7a1ee5c6c1 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java @@ -44,6 +44,12 @@ class Builder { private final List newFields = 
new ArrayList<>(); private final Collection tasks = new ArrayList<>(); + /** + * Constructor for the VectorSchemaRootTransformer's Builder. + * + * @param schema The Arrow schema. + * @param bufferAllocator The BufferAllocator to use for allocating memory. + */ public Builder(final Schema schema, final BufferAllocator bufferAllocator) { this.schema = schema; this.bufferAllocator = @@ -127,6 +133,11 @@ public Builder addEmptyField(final String fieldName, final ArrowType fieldType) return this; } + /** + * Build the {@link VectorSchemaRoot} with applied transformation tasks. + * + * @return The built {@link VectorSchemaRoot}. + */ public VectorSchemaRootTransformer build() { return (originalRoot, transformedRoot) -> { if (transformedRoot == null) { diff --git a/java/pom.xml b/java/pom.xml index c62ae332b1065..838e60b037261 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -105,6 +105,7 @@ under the License. 1.11.3 2 + 10.17.0 true 2.29.2 5.11.0 @@ -719,7 +720,7 @@ under the License. com.puppycrawl.tools checkstyle - 8.29 + ${checkstyle.version} org.slf4j diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index 50ddf30bf7e7c..387033f0d2f7b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -453,9 +453,9 @@ public void reAlloc() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 3a177f84c9853..552a896ea8c36 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -259,9 +259,9 @@ public void clear() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Override @Deprecated diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 5d761ffbee919..aaccec602f292 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -275,9 +275,9 @@ public void clear() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. 
- * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java index f0c84bd410640..aee5233f9d466 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -315,9 +315,9 @@ public void clearDataBuffers() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java index 208c8b416cf91..3762fecd0bdcc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java @@ -233,9 +233,9 @@ public List getFieldBuffers() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java index e58f7bba84494..391ef778169f5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -111,9 +111,9 @@ default void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, lon /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated List getFieldInnerVectors(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java index 25e5bdc6f46a3..227ca716f6391 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java @@ -224,9 +224,9 @@ public List getFieldBuffers() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 10637304df057..1cdb87eba0376 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -142,9 +142,9 @@ protected void reallocOffsetBuffer() { /** * Get the offset vector. 
* + * @return the underlying offset vector or null if none exists. * @deprecated This API will be removed, as the current implementations no longer hold inner * offset vectors. - * @return the underlying offset vector or null if none exists. */ @Override @Deprecated diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index ef2e7383a9105..b5b32c8032dfe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -319,9 +319,9 @@ private void setReaderAndWriterIndex() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override @@ -494,9 +494,9 @@ public void copyFrom(int inIndex, int outIndex, ValueVector from) { /** * Get the offset vector. * + * @return the underlying offset vector or null if none exists. * @deprecated This API will be removed, as the current implementations no longer hold inner * offset vectors. - * @return the underlying offset vector or null if none exists. */ @Override @Deprecated diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 17708167ff4b3..a1e18210fc686 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -277,9 +277,9 @@ private void setReaderAndWriterIndex() { /** * Get the inner vectors. * + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java index 16492aec7477a..de7966a0aee2e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java @@ -35,9 +35,9 @@ public interface RepeatedValueVector extends ValueVector, DensityAwareVector { /** * Get the offset vector. * + * @return the underlying offset vector or null if none exists. * @deprecated This API will be removed, as the current implementations no longer hold inner * offset vectors. - * @return the underlying offset vector or null if none exists. */ @Deprecated UInt4Vector getOffsetVector(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index 53f83fdfa7cbe..dda9b6547f758 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -188,9 +188,9 @@ private void setReaderAndWriterIndex() { /** * Get the inner vectors. 
* + * @return the inner vectors for this field as defined by the TypeLayout * @deprecated This API will be removed as the current implementations no longer support inner * vectors. - * @return the inner vectors for this field as defined by the TypeLayout */ @Deprecated @Override From 6bba2036a5b3d8fecd15260efa8d6e23b848c5f4 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 30 Jul 2024 12:29:56 -0700 Subject: [PATCH 46/73] GH-43479: [Java] Change visibility of MemoryUtil.UNSAFE (#43480) ### Rationale for this change `MemoryUtil.UNSAFE` field is a public field which provides unrestricted access to `sun.misc.Unsafe` instance which may cause misusage and possibly JVM crashes. ### What changes are included in this PR? Make the field (and other related fields) private and only allow indirect use of Unsafe through `MemoryUtil` methods ### Are these changes tested? Yes. ### Are there any user-facing changes? No **This PR includes breaking changes to public APIs.** Code using `MemoryUtil.UNSAFE` would break as the visibility of the field was changed to private * GitHub Issue: #43479 Authored-by: Laurent Goujon Signed-off-by: Dane Pitkin --- .../adapter/jdbc/consumer/ClobConsumer.java | 16 +--- .../FixedWidthOutOfPlaceVectorSorter.java | 2 +- .../VariableWidthOutOfPlaceVectorSorter.java | 2 +- .../apache/arrow/c/ArrowArrayUtilityTest.java | 12 +-- .../org/apache/arrow/memory/ArrowBuf.java | 82 +++++++++---------- .../memory/util/ByteFunctionHelpers.java | 42 +++++----- .../apache/arrow/memory/util/MemoryUtil.java | 72 +++++++++++++++- .../arrow/memory/util/hash/MurmurHasher.java | 4 +- .../arrow/memory/util/hash/SimpleHasher.java | 6 +- .../DefaultAllocationManagerFactory.java | 6 +- .../arrow/memory/TestBaseAllocator.java | 25 +----- .../arrow/memory/TestForeignAllocation.java | 4 +- .../unsafe/UnsafeAllocationManager.java | 6 +- .../arrow/vector/BaseFixedWidthVector.java | 2 +- .../apache/arrow/vector/BitVectorHelper.java | 13 ++- .../apache/arrow/vector/Decimal256Vector.java | 44 +++++----- .../apache/arrow/vector/DecimalVector.java | 43 ++++------ .../arrow/vector/util/DecimalUtility.java | 8 +- .../arrow/vector/util/VectorAppender.java | 18 ++-- .../arrow/vector/TestBitVectorHelper.java | 12 +-- 20 files changed, 218 insertions(+), 201 deletions(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/ClobConsumer.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/ClobConsumer.java index 7deba1cbffebd..9fcdd42414dfa 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/ClobConsumer.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/consumer/ClobConsumer.java @@ -86,12 +86,8 @@ public void consume(ResultSet resultSet) throws SQLException { while ((dataBuffer.writerIndex() + bytes.length) > dataBuffer.capacity()) { vector.reallocDataBuffer(); } - MemoryUtil.UNSAFE.copyMemory( - bytes, - MemoryUtil.BYTE_ARRAY_BASE_OFFSET, - null, - dataBuffer.memoryAddress() + startIndex + totalBytes, - bytes.length); + MemoryUtil.copyToMemory( + bytes, 0, dataBuffer.memoryAddress() + startIndex + totalBytes, bytes.length); totalBytes += bytes.length; read += readSize; @@ -133,12 +129,8 @@ public void consume(ResultSet resultSet) throws SQLException { while ((dataBuffer.writerIndex() + bytes.length) > dataBuffer.capacity()) { vector.reallocDataBuffer(); } - MemoryUtil.UNSAFE.copyMemory( - bytes, - MemoryUtil.BYTE_ARRAY_BASE_OFFSET, - null, - dataBuffer.memoryAddress() + startIndex + totalBytes, - 
bytes.length); + MemoryUtil.copyToMemory( + bytes, 0, dataBuffer.memoryAddress() + startIndex + totalBytes, bytes.length); totalBytes += bytes.length; read += readSize; diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthOutOfPlaceVectorSorter.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthOutOfPlaceVectorSorter.java index 817e890a5abe1..ac8b5a4be56aa 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthOutOfPlaceVectorSorter.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/FixedWidthOutOfPlaceVectorSorter.java @@ -77,7 +77,7 @@ public void sortOutOfPlace(V srcVector, V dstVector, VectorValueComparator co BitVectorHelper.unsetBit(dstValidityBuffer, dstIndex); } else { BitVectorHelper.setBit(dstValidityBuffer, dstIndex); - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( srcValueBuffer.memoryAddress() + srcIndex * ((long) valueWidth), dstValueBuffer.memoryAddress() + dstIndex * ((long) valueWidth), valueWidth); diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/VariableWidthOutOfPlaceVectorSorter.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/VariableWidthOutOfPlaceVectorSorter.java index 8f58dc0dcee0f..a3aca83441d2f 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/VariableWidthOutOfPlaceVectorSorter.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/VariableWidthOutOfPlaceVectorSorter.java @@ -91,7 +91,7 @@ public void sortOutOfPlace(V srcVector, V dstVector, VectorValueComparator co int valueLength = srcOffsetBuffer.getInt((srcIndex + 1) * ((long) BaseVariableWidthVector.OFFSET_WIDTH)) - srcOffset; - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( srcValueBuffer.memoryAddress() + srcOffset, dstValueBuffer.memoryAddress() + dstOffset, valueLength); diff --git a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java index 46f09ae5f0e8f..1d4cb411fab45 100644 --- a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java @@ -103,14 +103,14 @@ allocator, dummyHandle, new ArrowFieldNode(/* length= */ 0, 0), new long[] {0})) @Test void cleanupAfterFailure() throws Exception { // Note values are all dummy values here - long address = MemoryUtil.UNSAFE.allocateMemory(16); + long address = MemoryUtil.allocateMemory(16); try (BufferImportTypeVisitor visitor = new BufferImportTypeVisitor( allocator, dummyHandle, new ArrowFieldNode(0, 0), new long[] {address})) { // This fails, but only after we've already imported a buffer. 
assertThrows(IllegalStateException.class, () -> visitor.visit(new ArrowType.Int(32, true))); } finally { - MemoryUtil.UNSAFE.freeMemory(address); + MemoryUtil.freeMemory(address); } } @@ -119,7 +119,7 @@ void bufferAssociatedWithAllocator() throws Exception { // Note values are all dummy values here final long bufferSize = 16; final long fieldLength = bufferSize / IntVector.TYPE_WIDTH; - long address = MemoryUtil.UNSAFE.allocateMemory(bufferSize); + long address = MemoryUtil.allocateMemory(bufferSize); long baseline = allocator.getAllocatedMemory(); ArrowFieldNode fieldNode = new ArrowFieldNode(fieldLength, 0); try (BufferImportTypeVisitor visitor = @@ -134,7 +134,7 @@ void bufferAssociatedWithAllocator() throws Exception { .isEqualTo(allocator); assertThat(allocator.getAllocatedMemory()).isEqualTo(baseline + bufferSize); } finally { - MemoryUtil.UNSAFE.freeMemory(address); + MemoryUtil.freeMemory(address); } assertThat(allocator.getAllocatedMemory()).isEqualTo(baseline); } @@ -161,7 +161,7 @@ void releaseRetain() { @Test void associate() { final long bufferSize = 16; - final long address = MemoryUtil.UNSAFE.allocateMemory(bufferSize); + final long address = MemoryUtil.allocateMemory(bufferSize); try { ArrowArray array = ArrowArray.allocateNew(allocator); ReferenceCountedArrowArray handle = new ReferenceCountedArrowArray(array); @@ -173,7 +173,7 @@ void associate() { handle.release(); assertThat(array.isClosed()).isTrue(); } finally { - MemoryUtil.UNSAFE.freeMemory(address); + MemoryUtil.freeMemory(address); } } } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java index 24a0ea0761ec0..a958092a5789a 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java @@ -310,7 +310,7 @@ private void checkIndexD(long index, long fieldLength) { */ public long getLong(long index) { chk(index, LONG_SIZE); - return MemoryUtil.UNSAFE.getLong(addr(index)); + return MemoryUtil.getLong(addr(index)); } /** @@ -322,7 +322,7 @@ public long getLong(long index) { */ public void setLong(long index, long value) { chk(index, LONG_SIZE); - MemoryUtil.UNSAFE.putLong(addr(index), value); + MemoryUtil.putLong(addr(index), value); } /** @@ -345,7 +345,7 @@ public float getFloat(long index) { */ public void setFloat(long index, float value) { chk(index, FLOAT_SIZE); - MemoryUtil.UNSAFE.putInt(addr(index), Float.floatToRawIntBits(value)); + MemoryUtil.putInt(addr(index), Float.floatToRawIntBits(value)); } /** @@ -368,7 +368,7 @@ public double getDouble(long index) { */ public void setDouble(long index, double value) { chk(index, DOUBLE_SIZE); - MemoryUtil.UNSAFE.putLong(addr(index), Double.doubleToRawLongBits(value)); + MemoryUtil.putLong(addr(index), Double.doubleToRawLongBits(value)); } /** @@ -391,7 +391,7 @@ public char getChar(long index) { */ public void setChar(long index, int value) { chk(index, SHORT_SIZE); - MemoryUtil.UNSAFE.putShort(addr(index), (short) value); + MemoryUtil.putShort(addr(index), (short) value); } /** @@ -403,7 +403,7 @@ public void setChar(long index, int value) { */ public int getInt(long index) { chk(index, INT_SIZE); - return MemoryUtil.UNSAFE.getInt(addr(index)); + return MemoryUtil.getInt(addr(index)); } /** @@ -414,7 +414,7 @@ public int getInt(long index) { */ public void setInt(long index, int value) { chk(index, INT_SIZE); - MemoryUtil.UNSAFE.putInt(addr(index), 
value); + MemoryUtil.putInt(addr(index), value); } /** @@ -426,7 +426,7 @@ public void setInt(long index, int value) { */ public short getShort(long index) { chk(index, SHORT_SIZE); - return MemoryUtil.UNSAFE.getShort(addr(index)); + return MemoryUtil.getShort(addr(index)); } /** @@ -449,7 +449,7 @@ public void setShort(long index, int value) { */ public void setShort(long index, short value) { chk(index, SHORT_SIZE); - MemoryUtil.UNSAFE.putShort(addr(index), value); + MemoryUtil.putShort(addr(index), value); } /** @@ -461,7 +461,7 @@ public void setShort(long index, short value) { */ public void setByte(long index, int value) { chk(index, 1); - MemoryUtil.UNSAFE.putByte(addr(index), (byte) value); + MemoryUtil.putByte(addr(index), (byte) value); } /** @@ -473,7 +473,7 @@ public void setByte(long index, int value) { */ public void setByte(long index, byte value) { chk(index, 1); - MemoryUtil.UNSAFE.putByte(addr(index), value); + MemoryUtil.putByte(addr(index), value); } /** @@ -485,7 +485,7 @@ public void setByte(long index, byte value) { */ public byte getByte(long index) { chk(index, 1); - return MemoryUtil.UNSAFE.getByte(addr(index)); + return MemoryUtil.getByte(addr(index)); } /*--------------------------------------------------* @@ -566,7 +566,7 @@ public void readBytes(byte[] dst) { */ public void writeByte(byte value) { ensureWritable(1); - MemoryUtil.UNSAFE.putByte(addr(writerIndex), value); + MemoryUtil.putByte(addr(writerIndex), value); ++writerIndex; } @@ -577,7 +577,7 @@ public void writeByte(byte value) { */ public void writeByte(int value) { ensureWritable(1); - MemoryUtil.UNSAFE.putByte(addr(writerIndex), (byte) value); + MemoryUtil.putByte(addr(writerIndex), (byte) value); ++writerIndex; } @@ -612,7 +612,7 @@ public void writeBytes(byte[] src, int srcIndex, int length) { */ public void writeShort(int value) { ensureWritable(SHORT_SIZE); - MemoryUtil.UNSAFE.putShort(addr(writerIndex), (short) value); + MemoryUtil.putShort(addr(writerIndex), (short) value); writerIndex += SHORT_SIZE; } @@ -623,7 +623,7 @@ public void writeShort(int value) { */ public void writeInt(int value) { ensureWritable(INT_SIZE); - MemoryUtil.UNSAFE.putInt(addr(writerIndex), value); + MemoryUtil.putInt(addr(writerIndex), value); writerIndex += INT_SIZE; } @@ -634,7 +634,7 @@ public void writeInt(int value) { */ public void writeLong(long value) { ensureWritable(LONG_SIZE); - MemoryUtil.UNSAFE.putLong(addr(writerIndex), value); + MemoryUtil.putLong(addr(writerIndex), value); writerIndex += LONG_SIZE; } @@ -645,7 +645,7 @@ public void writeLong(long value) { */ public void writeFloat(float value) { ensureWritable(FLOAT_SIZE); - MemoryUtil.UNSAFE.putInt(addr(writerIndex), Float.floatToRawIntBits(value)); + MemoryUtil.putInt(addr(writerIndex), Float.floatToRawIntBits(value)); writerIndex += FLOAT_SIZE; } @@ -656,7 +656,7 @@ public void writeFloat(float value) { */ public void writeDouble(double value) { ensureWritable(DOUBLE_SIZE); - MemoryUtil.UNSAFE.putLong(addr(writerIndex), Double.doubleToRawLongBits(value)); + MemoryUtil.putLong(addr(writerIndex), Double.doubleToRawLongBits(value)); writerIndex += DOUBLE_SIZE; } @@ -727,8 +727,7 @@ public void getBytes(long index, byte[] dst, int dstIndex, int length) { if (length != 0) { // copy "length" bytes from this ArrowBuf starting at addr(index) address // into dst byte array at dstIndex onwards - MemoryUtil.copyMemory( - null, addr(index), dst, MemoryUtil.BYTE_ARRAY_BASE_OFFSET + dstIndex, length); + MemoryUtil.copyFromMemory(addr(index), dst, dstIndex, 
length); } } @@ -766,8 +765,7 @@ public void setBytes(long index, byte[] src, int srcIndex, long length) { if (length > 0) { // copy "length" bytes from src byte array at the starting index (srcIndex) // into this ArrowBuf starting at address "addr(index)" - MemoryUtil.copyMemory( - src, MemoryUtil.BYTE_ARRAY_BASE_OFFSET + srcIndex, null, addr(index), length); + MemoryUtil.copyToMemory(src, srcIndex, addr(index), length); } } @@ -792,7 +790,7 @@ public void getBytes(long index, ByteBuffer dst) { // at address srcAddress into the dst ByteBuffer starting at // address dstAddress final long dstAddress = MemoryUtil.getByteBufferAddress(dst) + dst.position(); - MemoryUtil.copyMemory(null, srcAddress, null, dstAddress, dst.remaining()); + MemoryUtil.copyMemory(srcAddress, dstAddress, dst.remaining()); // after copy, bump the next write position for the dst ByteBuffer dst.position(dst.position() + dst.remaining()); } else if (dst.hasArray()) { @@ -800,12 +798,7 @@ public void getBytes(long index, ByteBuffer dst) { // at address srcAddress into the dst ByteBuffer starting at // index dstIndex final int dstIndex = dst.arrayOffset() + dst.position(); - MemoryUtil.copyMemory( - null, - srcAddress, - dst.array(), - MemoryUtil.BYTE_ARRAY_BASE_OFFSET + dstIndex, - dst.remaining()); + MemoryUtil.copyFromMemory(srcAddress, dst.array(), dstIndex, dst.remaining()); // after copy, bump the next write position for the dst ByteBuffer dst.position(dst.position() + dst.remaining()); } else { @@ -834,15 +827,14 @@ public void setBytes(long index, ByteBuffer src) { // copy src.remaining() bytes of data from src ByteBuffer starting at // address srcAddress into this ArrowBuf starting at address dstAddress final long srcAddress = MemoryUtil.getByteBufferAddress(src) + src.position(); - MemoryUtil.copyMemory(null, srcAddress, null, dstAddress, length); + MemoryUtil.copyMemory(srcAddress, dstAddress, length); // after copy, bump the next read position for the src ByteBuffer src.position(src.position() + length); } else if (src.hasArray()) { // copy src.remaining() bytes of data from src ByteBuffer starting at // index srcIndex into this ArrowBuf starting at address dstAddress final int srcIndex = src.arrayOffset() + src.position(); - MemoryUtil.copyMemory( - src.array(), MemoryUtil.BYTE_ARRAY_BASE_OFFSET + srcIndex, null, dstAddress, length); + MemoryUtil.copyToMemory(src.array(), srcIndex, dstAddress, length); // after copy, bump the next read position for the src ByteBuffer src.position(src.position() + length); } else { @@ -852,19 +844,19 @@ public void setBytes(long index, ByteBuffer src) { // copy word at a time while (length - 128 >= LONG_SIZE) { for (int x = 0; x < 16; x++) { - MemoryUtil.UNSAFE.putLong(dstAddress, src.getLong()); + MemoryUtil.putLong(dstAddress, src.getLong()); length -= LONG_SIZE; dstAddress += LONG_SIZE; } } while (length >= LONG_SIZE) { - MemoryUtil.UNSAFE.putLong(dstAddress, src.getLong()); + MemoryUtil.putLong(dstAddress, src.getLong()); length -= LONG_SIZE; dstAddress += LONG_SIZE; } // copy last byte while (length > 0) { - MemoryUtil.UNSAFE.putByte(dstAddress, src.get()); + MemoryUtil.putByte(dstAddress, src.get()); --length; ++dstAddress; } @@ -892,7 +884,7 @@ public void setBytes(long index, ByteBuffer src, int srcIndex, int length) { // srcAddress into this ArrowBuf at address dstAddress final long srcAddress = MemoryUtil.getByteBufferAddress(src) + srcIndex; final long dstAddress = addr(index); - MemoryUtil.copyMemory(null, srcAddress, null, dstAddress, length); + 
MemoryUtil.copyMemory(srcAddress, dstAddress, length); } else { if (srcIndex == 0 && src.capacity() == length) { // copy the entire ByteBuffer from start to end of length @@ -932,7 +924,7 @@ public void getBytes(long index, ArrowBuf dst, long dstIndex, int length) { // dstAddress final long srcAddress = addr(index); final long dstAddress = dst.memoryAddress() + (long) dstIndex; - MemoryUtil.copyMemory(null, srcAddress, null, dstAddress, length); + MemoryUtil.copyMemory(srcAddress, dstAddress, length); } } @@ -962,7 +954,7 @@ public void setBytes(long index, ArrowBuf src, long srcIndex, long length) { // dstAddress final long srcAddress = src.memoryAddress() + srcIndex; final long dstAddress = addr(index); - MemoryUtil.copyMemory(null, srcAddress, null, dstAddress, length); + MemoryUtil.copyMemory(srcAddress, dstAddress, length); } } @@ -982,7 +974,7 @@ public void setBytes(long index, ArrowBuf src) { checkIndex(index, length); final long srcAddress = src.memoryAddress() + src.readerIndex; final long dstAddress = addr(index); - MemoryUtil.copyMemory(null, srcAddress, null, dstAddress, length); + MemoryUtil.copyMemory(srcAddress, dstAddress, length); src.readerIndex(src.readerIndex + length); } @@ -1007,7 +999,7 @@ public int setBytes(long index, InputStream in, int length) throws IOException { if (readBytes > 0) { // copy readBytes length of data from the tmp byte array starting // at srcIndex 0 into this ArrowBuf starting at address addr(index) - MemoryUtil.copyMemory(tmp, MemoryUtil.BYTE_ARRAY_BASE_OFFSET, null, addr(index), readBytes); + MemoryUtil.copyToMemory(tmp, 0, addr(index), readBytes); } } return readBytes; @@ -1029,7 +1021,7 @@ public void getBytes(long index, OutputStream out, int length) throws IOExceptio // copy length bytes of data from this ArrowBuf starting at // address addr(index) into the tmp byte array starting at index 0 byte[] tmp = new byte[length]; - MemoryUtil.copyMemory(null, addr(index), tmp, MemoryUtil.BYTE_ARRAY_BASE_OFFSET, length); + MemoryUtil.copyFromMemory(addr(index), tmp, 0, length); // write the copied data to output stream out.write(tmp); } @@ -1173,7 +1165,7 @@ public ArrowBuf writerIndex(long writerIndex) { public ArrowBuf setZero(long index, long length) { if (length != 0) { this.checkIndex(index, length); - MemoryUtil.UNSAFE.setMemory(this.addr + index, length, (byte) 0); + MemoryUtil.setMemory(this.addr + index, length, (byte) 0); } return this; } @@ -1191,7 +1183,7 @@ public ArrowBuf setZero(long index, long length) { public ArrowBuf setOne(int index, int length) { if (length != 0) { this.checkIndex(index, length); - MemoryUtil.UNSAFE.setMemory(this.addr + index, length, (byte) 0xff); + MemoryUtil.setMemory(this.addr + index, length, (byte) 0xff); } return this; } @@ -1207,7 +1199,7 @@ public ArrowBuf setOne(int index, int length) { public ArrowBuf setOne(long index, long length) { if (length != 0) { this.checkIndex(index, length); - MemoryUtil.UNSAFE.setMemory(this.addr + index, length, (byte) 0xff); + MemoryUtil.setMemory(this.addr + index, length, (byte) 0xff); } return this; } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java index 44289183a318d..9243be399b6db 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java @@ -61,8 +61,8 @@ private static int 
memEqual( while (n > 63) { for (int x = 0; x < 8; x++) { - long leftLong = MemoryUtil.UNSAFE.getLong(lPos); - long rightLong = MemoryUtil.UNSAFE.getLong(rPos); + long leftLong = MemoryUtil.getLong(lPos); + long rightLong = MemoryUtil.getLong(rPos); if (leftLong != rightLong) { return 0; } @@ -73,8 +73,8 @@ private static int memEqual( } while (n > 7) { - long leftLong = MemoryUtil.UNSAFE.getLong(lPos); - long rightLong = MemoryUtil.UNSAFE.getLong(rPos); + long leftLong = MemoryUtil.getLong(lPos); + long rightLong = MemoryUtil.getLong(rPos); if (leftLong != rightLong) { return 0; } @@ -84,8 +84,8 @@ private static int memEqual( } if (n > 3) { - int leftInt = MemoryUtil.UNSAFE.getInt(lPos); - int rightInt = MemoryUtil.UNSAFE.getInt(rPos); + int leftInt = MemoryUtil.getInt(lPos); + int rightInt = MemoryUtil.getInt(rPos); if (leftInt != rightInt) { return 0; } @@ -95,8 +95,8 @@ private static int memEqual( } while (n-- != 0) { - byte leftByte = MemoryUtil.UNSAFE.getByte(lPos); - byte rightByte = MemoryUtil.UNSAFE.getByte(rPos); + byte leftByte = MemoryUtil.getByte(lPos); + byte rightByte = MemoryUtil.getByte(rPos); if (leftByte != rightByte) { return 0; } @@ -141,8 +141,8 @@ private static int memcmp( while (n > 63) { for (int x = 0; x < 8; x++) { - long leftLong = MemoryUtil.UNSAFE.getLong(lPos); - long rightLong = MemoryUtil.UNSAFE.getLong(rPos); + long leftLong = MemoryUtil.getLong(lPos); + long rightLong = MemoryUtil.getLong(rPos); if (leftLong != rightLong) { if (LITTLE_ENDIAN) { return unsignedLongCompare(Long.reverseBytes(leftLong), Long.reverseBytes(rightLong)); @@ -157,8 +157,8 @@ private static int memcmp( } while (n > 7) { - long leftLong = MemoryUtil.UNSAFE.getLong(lPos); - long rightLong = MemoryUtil.UNSAFE.getLong(rPos); + long leftLong = MemoryUtil.getLong(lPos); + long rightLong = MemoryUtil.getLong(rPos); if (leftLong != rightLong) { if (LITTLE_ENDIAN) { return unsignedLongCompare(Long.reverseBytes(leftLong), Long.reverseBytes(rightLong)); @@ -172,8 +172,8 @@ private static int memcmp( } if (n > 3) { - int leftInt = MemoryUtil.UNSAFE.getInt(lPos); - int rightInt = MemoryUtil.UNSAFE.getInt(rPos); + int leftInt = MemoryUtil.getInt(lPos); + int rightInt = MemoryUtil.getInt(rPos); if (leftInt != rightInt) { if (LITTLE_ENDIAN) { return unsignedIntCompare(Integer.reverseBytes(leftInt), Integer.reverseBytes(rightInt)); @@ -187,8 +187,8 @@ private static int memcmp( } while (n-- != 0) { - byte leftByte = MemoryUtil.UNSAFE.getByte(lPos); - byte rightByte = MemoryUtil.UNSAFE.getByte(rPos); + byte leftByte = MemoryUtil.getByte(lPos); + byte rightByte = MemoryUtil.getByte(rPos); if (leftByte != rightByte) { return ((leftByte & 0xFF) - (rightByte & 0xFF)) > 0 ? 
1 : -1; } @@ -248,8 +248,8 @@ private static int memcmp( int rPos = rStart; while (n > 7) { - long leftLong = MemoryUtil.UNSAFE.getLong(lPos); - long rightLong = MemoryUtil.UNSAFE.getLong(right, MemoryUtil.BYTE_ARRAY_BASE_OFFSET + rPos); + long leftLong = MemoryUtil.getLong(lPos); + long rightLong = MemoryUtil.getLong(right, rPos); if (leftLong != rightLong) { if (LITTLE_ENDIAN) { return unsignedLongCompare(Long.reverseBytes(leftLong), Long.reverseBytes(rightLong)); @@ -263,8 +263,8 @@ private static int memcmp( } if (n > 3) { - int leftInt = MemoryUtil.UNSAFE.getInt(lPos); - int rightInt = MemoryUtil.UNSAFE.getInt(right, MemoryUtil.BYTE_ARRAY_BASE_OFFSET + rPos); + int leftInt = MemoryUtil.getInt(lPos); + int rightInt = MemoryUtil.getInt(right, rPos); if (leftInt != rightInt) { if (LITTLE_ENDIAN) { return unsignedIntCompare(Integer.reverseBytes(leftInt), Integer.reverseBytes(rightInt)); @@ -278,7 +278,7 @@ private static int memcmp( } while (n-- != 0) { - byte leftByte = MemoryUtil.UNSAFE.getByte(lPos); + byte leftByte = MemoryUtil.getByte(lPos); byte rightByte = right[rPos]; if (leftByte != rightByte) { return ((leftByte & 0xFF) - (rightByte & 0xFF)) > 0 ? 1 : -1; diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java index c1b44c3932659..acf77547fbcdd 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java @@ -33,13 +33,13 @@ public class MemoryUtil { private static final @Nullable Constructor DIRECT_BUFFER_CONSTRUCTOR; /** The unsafe object from which to access the off-heap memory. */ - public static final Unsafe UNSAFE; + private static final Unsafe UNSAFE; /** The start offset of array data relative to the start address of the array object. */ - public static final long BYTE_ARRAY_BASE_OFFSET; + private static final long BYTE_ARRAY_BASE_OFFSET; /** The offset of the address field with the {@link java.nio.ByteBuffer} object. */ - static final long BYTE_BUFFER_ADDRESS_OFFSET; + private static final long BYTE_BUFFER_ADDRESS_OFFSET; /** If the native byte order is little-endian. 
*/ public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; @@ -178,7 +178,7 @@ public static ByteBuffer directBuffer(long address, int capacity) { @SuppressWarnings( "nullness:argument") // to handle null assignment on third party dependency: Unsafe - public static void copyMemory( + private static void copyMemory( @Nullable Object srcBase, long srcOffset, @Nullable Object destBase, @@ -186,4 +186,68 @@ public static void copyMemory( long bytes) { UNSAFE.copyMemory(srcBase, srcOffset, destBase, destOffset, bytes); } + + public static void copyMemory(long srcAddress, long destAddress, long bytes) { + UNSAFE.copyMemory(srcAddress, destAddress, bytes); + } + + public static void copyToMemory(byte[] src, long srcIndex, long destAddress, long bytes) { + copyMemory(src, BYTE_ARRAY_BASE_OFFSET + srcIndex, null, destAddress, bytes); + } + + public static void copyFromMemory(long srcAddress, byte[] dest, long destIndex, long bytes) { + copyMemory(null, srcAddress, dest, BYTE_ARRAY_BASE_OFFSET + destIndex, bytes); + } + + public static byte getByte(long address) { + return UNSAFE.getByte(address); + } + + public static void putByte(long address, byte value) { + UNSAFE.putByte(address, value); + } + + public static short getShort(long address) { + return UNSAFE.getShort(address); + } + + public static void putShort(long address, short value) { + UNSAFE.putShort(address, value); + } + + public static int getInt(long address) { + return UNSAFE.getInt(address); + } + + public static void putInt(long address, int value) { + UNSAFE.putInt(address, value); + } + + public static long getLong(long address) { + return UNSAFE.getLong(address); + } + + public static void putLong(long address, long value) { + UNSAFE.putLong(address, value); + } + + public static void setMemory(long address, long bytes, byte value) { + UNSAFE.setMemory(address, bytes, value); + } + + public static int getInt(byte[] bytes, int index) { + return UNSAFE.getInt(bytes, BYTE_ARRAY_BASE_OFFSET + index); + } + + public static long getLong(byte[] bytes, int index) { + return UNSAFE.getLong(bytes, BYTE_ARRAY_BASE_OFFSET + index); + } + + public static long allocateMemory(long bytes) { + return UNSAFE.allocateMemory(bytes); + } + + public static void freeMemory(long address) { + UNSAFE.freeMemory(address); + } } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java index eaf4a833c4eeb..7907018d0a815 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java @@ -86,7 +86,7 @@ public static int hashCode(long address, long length, int seed) { int index = 0; int hash = seed; while (index + 4 <= length) { - int intValue = MemoryUtil.UNSAFE.getInt(address + index); + int intValue = MemoryUtil.getInt(address + index); hash = combineHashCode(hash, intValue); index += 4; } @@ -96,7 +96,7 @@ public static int hashCode(long address, long length, int seed) { int intValue = 0; for (long i = length - 1; i >= index; i--) { intValue <<= 8; - intValue |= (MemoryUtil.UNSAFE.getByte(address + i) & 0x000000ff); + intValue |= (MemoryUtil.getByte(address + i) & 0x000000ff); index += 1; } hash = combineHashCode(hash, intValue); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java 
b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java index b9987a5ecb049..5c1384163e81e 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java @@ -52,21 +52,21 @@ public int hashCode(long address, long length) { int hashValue = 0; int index = 0; while (index + 8 <= length) { - long longValue = MemoryUtil.UNSAFE.getLong(address + index); + long longValue = MemoryUtil.getLong(address + index); int longHash = getLongHashCode(longValue); hashValue = combineHashCode(hashValue, longHash); index += 8; } if (index + 4 <= length) { - int intValue = MemoryUtil.UNSAFE.getInt(address + index); + int intValue = MemoryUtil.getInt(address + index); int intHash = intValue; hashValue = combineHashCode(hashValue, intHash); index += 4; } while (index < length) { - byte byteValue = MemoryUtil.UNSAFE.getByte(address + index); + byte byteValue = MemoryUtil.getByte(address + index); int byteHash = byteValue; hashValue = combineHashCode(hashValue, byteHash); index += 1; diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/DefaultAllocationManagerFactory.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/DefaultAllocationManagerFactory.java index 83118face8674..348ed3e7933b0 100644 --- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/DefaultAllocationManagerFactory.java +++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/DefaultAllocationManagerFactory.java @@ -27,13 +27,13 @@ public class DefaultAllocationManagerFactory implements AllocationManager.Factor public static final AllocationManager.Factory FACTORY = new DefaultAllocationManagerFactory(); private static final ArrowBuf EMPTY = - new ArrowBuf(ReferenceManager.NO_OP, null, 0, MemoryUtil.UNSAFE.allocateMemory(0)); + new ArrowBuf(ReferenceManager.NO_OP, null, 0, MemoryUtil.allocateMemory(0)); @Override public AllocationManager create(BufferAllocator accountingAllocator, long size) { return new AllocationManager(accountingAllocator) { private final long allocatedSize = size; - private final long address = MemoryUtil.UNSAFE.allocateMemory(size); + private final long address = MemoryUtil.allocateMemory(size); @Override public long getSize() { @@ -47,7 +47,7 @@ protected long memoryAddress() { @Override protected void release0() { - MemoryUtil.UNSAFE.freeMemory(address); + MemoryUtil.freeMemory(address); } }; } diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java index d7d7fde00ac63..a5fbc67c48f5c 100644 --- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java +++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java @@ -25,7 +25,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -import java.lang.reflect.Field; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -34,9 +33,9 @@ import org.apache.arrow.memory.rounding.RoundingPolicy; import org.apache.arrow.memory.rounding.SegmentRoundingPolicy; import org.apache.arrow.memory.util.AssertionUtil; +import org.apache.arrow.memory.util.MemoryUtil; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import sun.misc.Unsafe; public class 
TestBaseAllocator { @@ -405,8 +404,7 @@ private BaseAllocator createAllocatorWithCustomizedAllocationManager() { public AllocationManager create( BufferAllocator accountingAllocator, long requestedSize) { return new AllocationManager(accountingAllocator) { - private final Unsafe unsafe = getUnsafe(); - private final long address = unsafe.allocateMemory(requestedSize); + private final long address = MemoryUtil.allocateMemory(requestedSize); @Override protected long memoryAddress() { @@ -415,29 +413,14 @@ protected long memoryAddress() { @Override protected void release0() { - unsafe.setMemory(address, requestedSize, (byte) 0); - unsafe.freeMemory(address); + MemoryUtil.setMemory(address, requestedSize, (byte) 0); + MemoryUtil.freeMemory(address); } @Override public long getSize() { return requestedSize; } - - private Unsafe getUnsafe() { - Field f = null; - try { - f = Unsafe.class.getDeclaredField("theUnsafe"); - f.setAccessible(true); - return (Unsafe) f.get(null); - } catch (NoSuchFieldException | IllegalAccessException e) { - throw new RuntimeException(e); - } finally { - if (f != null) { - f.setAccessible(false); - } - } - } }; } diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestForeignAllocation.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestForeignAllocation.java index 162bbbcbe939c..b19453df5e109 100644 --- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestForeignAllocation.java +++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestForeignAllocation.java @@ -139,13 +139,13 @@ private static class UnsafeForeignAllocation extends ForeignAllocation { boolean released = false; public UnsafeForeignAllocation(long bufferSize) { - super(bufferSize, MemoryUtil.UNSAFE.allocateMemory(bufferSize)); + super(bufferSize, MemoryUtil.allocateMemory(bufferSize)); } @Override protected void release0() { if (!released) { - MemoryUtil.UNSAFE.freeMemory(memoryAddress()); + MemoryUtil.freeMemory(memoryAddress()); released = true; } } diff --git a/java/memory/memory-unsafe/src/main/java/org/apache/arrow/memory/unsafe/UnsafeAllocationManager.java b/java/memory/memory-unsafe/src/main/java/org/apache/arrow/memory/unsafe/UnsafeAllocationManager.java index 31af262a5720e..67d7e0d2af7cb 100644 --- a/java/memory/memory-unsafe/src/main/java/org/apache/arrow/memory/unsafe/UnsafeAllocationManager.java +++ b/java/memory/memory-unsafe/src/main/java/org/apache/arrow/memory/unsafe/UnsafeAllocationManager.java @@ -26,7 +26,7 @@ public final class UnsafeAllocationManager extends AllocationManager { private static final ArrowBuf EMPTY = - new ArrowBuf(ReferenceManager.NO_OP, null, 0, MemoryUtil.UNSAFE.allocateMemory(0)); + new ArrowBuf(ReferenceManager.NO_OP, null, 0, MemoryUtil.allocateMemory(0)); public static final AllocationManager.Factory FACTORY = new Factory() { @@ -47,7 +47,7 @@ public ArrowBuf empty() { UnsafeAllocationManager(BufferAllocator accountingAllocator, long requestedSize) { super(accountingAllocator); - allocatedAddress = MemoryUtil.UNSAFE.allocateMemory(requestedSize); + allocatedAddress = MemoryUtil.allocateMemory(requestedSize); allocatedSize = requestedSize; } @@ -63,6 +63,6 @@ protected long memoryAddress() { @Override protected void release0() { - MemoryUtil.UNSAFE.freeMemory(allocatedAddress); + MemoryUtil.freeMemory(allocatedAddress); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java 
index 387033f0d2f7b..4be55396b7492 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -888,7 +888,7 @@ public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { BitVectorHelper.unsetBit(this.getValidityBuffer(), thisIndex); } else { BitVectorHelper.setBit(this.getValidityBuffer(), thisIndex); - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( from.getDataBuffer().memoryAddress() + (long) fromIndex * typeWidth, this.getDataBuffer().memoryAddress() + (long) thisIndex * typeWidth, typeWidth); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java index cb7ef62013de5..0ac56691a6f6c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java @@ -247,7 +247,7 @@ public static boolean checkAllBitsEqualTo( int index = 0; while (index + 8 <= fullBytesCount) { - long longValue = MemoryUtil.UNSAFE.getLong(validityBuffer.memoryAddress() + index); + long longValue = MemoryUtil.getLong(validityBuffer.memoryAddress() + index); if (longValue != (long) intToCompare) { return false; } @@ -255,7 +255,7 @@ public static boolean checkAllBitsEqualTo( } if (index + 4 <= fullBytesCount) { - int intValue = MemoryUtil.UNSAFE.getInt(validityBuffer.memoryAddress() + index); + int intValue = MemoryUtil.getInt(validityBuffer.memoryAddress() + index); if (intValue != intToCompare) { return false; } @@ -263,7 +263,7 @@ public static boolean checkAllBitsEqualTo( } while (index < fullBytesCount) { - byte byteValue = MemoryUtil.UNSAFE.getByte(validityBuffer.memoryAddress() + index); + byte byteValue = MemoryUtil.getByte(validityBuffer.memoryAddress() + index); if (byteValue != (byte) intToCompare) { return false; } @@ -272,7 +272,7 @@ public static boolean checkAllBitsEqualTo( // handling with the last bits if (remainder != 0) { - byte byteValue = MemoryUtil.UNSAFE.getByte(validityBuffer.memoryAddress() + sizeInBytes - 1); + byte byteValue = MemoryUtil.getByte(validityBuffer.memoryAddress() + sizeInBytes - 1); byte mask = (byte) ((1 << remainder) - 1); byteValue = (byte) (byteValue & mask); if (checkOneBits) { @@ -386,7 +386,7 @@ public static void concatBits( // copy the first bit set if (input1 != output) { - MemoryUtil.UNSAFE.copyMemory(input1.memoryAddress(), output.memoryAddress(), numBytes1); + MemoryUtil.copyMemory(input1.memoryAddress(), output.memoryAddress(), numBytes1); } if (bitIndex(numBits1) == 0) { @@ -394,8 +394,7 @@ public static void concatBits( // boundary. // For this case, we have a shortcut to copy all bytes from the second set after the byte // boundary. 
- MemoryUtil.UNSAFE.copyMemory( - input2.memoryAddress(), output.memoryAddress() + numBytes1, numBytes2); + MemoryUtil.copyMemory(input2.memoryAddress(), output.memoryAddress() + numBytes1, numBytes2); return; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java index 8774956522aef..42ad741c85f8b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java @@ -213,13 +213,13 @@ public void setBigEndian(int index, byte[] value) { long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; if (length == 0) { - MemoryUtil.UNSAFE.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH, (byte) 0); + MemoryUtil.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH, (byte) 0); return; } if (LITTLE_ENDIAN) { // swap bytes to convert BE to LE for (int byteIdx = 0; byteIdx < length; ++byteIdx) { - MemoryUtil.UNSAFE.putByte(outAddress + byteIdx, value[length - 1 - byteIdx]); + MemoryUtil.putByte(outAddress + byteIdx, value[length - 1 - byteIdx]); } if (length == TYPE_WIDTH) { @@ -229,21 +229,17 @@ public void setBigEndian(int index, byte[] value) { if (length < TYPE_WIDTH) { // sign extend final byte pad = (byte) (value[0] < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad); return; } } else { if (length <= TYPE_WIDTH) { // copy data from value to outAddress - MemoryUtil.UNSAFE.copyMemory( - value, - MemoryUtil.BYTE_ARRAY_BASE_OFFSET, - null, - outAddress + Decimal256Vector.TYPE_WIDTH - length, - length); + MemoryUtil.copyToMemory( + value, 0, outAddress + Decimal256Vector.TYPE_WIDTH - length, length); // sign extend final byte pad = (byte) (value[0] < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad); return; } } @@ -282,21 +278,20 @@ public void setSafe(int index, long start, ArrowBuf buffer, int length) { long inAddress = buffer.memoryAddress() + start; long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; if (LITTLE_ENDIAN) { - MemoryUtil.UNSAFE.copyMemory(inAddress, outAddress, length); + MemoryUtil.copyMemory(inAddress, outAddress, length); // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress + length - 1); + byte msb = MemoryUtil.getByte(inAddress + length - 1); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad); } } else { - MemoryUtil.UNSAFE.copyMemory( - inAddress, outAddress + Decimal256Vector.TYPE_WIDTH - length, length); + MemoryUtil.copyMemory(inAddress, outAddress + Decimal256Vector.TYPE_WIDTH - length, length); // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress); + byte msb = MemoryUtil.getByte(inAddress); final byte pad = (byte) (msb < 0 ? 
0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad); } } } @@ -323,23 +318,22 @@ public void setBigEndianSafe(int index, long start, ArrowBuf buffer, int length) if (LITTLE_ENDIAN) { // swap bytes to convert BE to LE for (int byteIdx = 0; byteIdx < length; ++byteIdx) { - byte val = MemoryUtil.UNSAFE.getByte((inAddress + length - 1) - byteIdx); - MemoryUtil.UNSAFE.putByte(outAddress + byteIdx, val); + byte val = MemoryUtil.getByte((inAddress + length - 1) - byteIdx); + MemoryUtil.putByte(outAddress + byteIdx, val); } // sign extend if (length < 32) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress); + byte msb = MemoryUtil.getByte(inAddress); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress + length, Decimal256Vector.TYPE_WIDTH - length, pad); } } else { - MemoryUtil.UNSAFE.copyMemory( - inAddress, outAddress + Decimal256Vector.TYPE_WIDTH - length, length); + MemoryUtil.copyMemory(inAddress, outAddress + Decimal256Vector.TYPE_WIDTH - length, length); // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress); + byte msb = MemoryUtil.getByte(inAddress); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress, Decimal256Vector.TYPE_WIDTH - length, pad); } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java index c2f4a14de7cc7..b4c55680b7305 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java @@ -212,13 +212,13 @@ public void setBigEndian(int index, byte[] value) { long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; if (length == 0) { - MemoryUtil.UNSAFE.setMemory(outAddress, DecimalVector.TYPE_WIDTH, (byte) 0); + MemoryUtil.setMemory(outAddress, DecimalVector.TYPE_WIDTH, (byte) 0); return; } if (LITTLE_ENDIAN) { // swap bytes to convert BE to LE for (int byteIdx = 0; byteIdx < length; ++byteIdx) { - MemoryUtil.UNSAFE.putByte(outAddress + byteIdx, value[length - 1 - byteIdx]); + MemoryUtil.putByte(outAddress + byteIdx, value[length - 1 - byteIdx]); } if (length == TYPE_WIDTH) { @@ -228,21 +228,16 @@ public void setBigEndian(int index, byte[] value) { if (length < TYPE_WIDTH) { // sign extend final byte pad = (byte) (value[0] < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); return; } } else { if (length <= TYPE_WIDTH) { // copy data from value to outAddress - MemoryUtil.UNSAFE.copyMemory( - value, - MemoryUtil.BYTE_ARRAY_BASE_OFFSET, - null, - outAddress + DecimalVector.TYPE_WIDTH - length, - length); + MemoryUtil.copyToMemory(value, 0, outAddress + DecimalVector.TYPE_WIDTH - length, length); // sign extend final byte pad = (byte) (value[0] < 0 ? 
0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); return; } } @@ -281,21 +276,20 @@ public void setSafe(int index, long start, ArrowBuf buffer, int length) { long inAddress = buffer.memoryAddress() + start; long outAddress = valueBuffer.memoryAddress() + (long) index * TYPE_WIDTH; if (LITTLE_ENDIAN) { - MemoryUtil.UNSAFE.copyMemory(inAddress, outAddress, length); + MemoryUtil.copyMemory(inAddress, outAddress, length); // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress + length - 1); + byte msb = MemoryUtil.getByte(inAddress + length - 1); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); } } else { - MemoryUtil.UNSAFE.copyMemory( - inAddress, outAddress + DecimalVector.TYPE_WIDTH - length, length); + MemoryUtil.copyMemory(inAddress, outAddress + DecimalVector.TYPE_WIDTH - length, length); // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress); + byte msb = MemoryUtil.getByte(inAddress); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); } } } @@ -322,23 +316,22 @@ public void setBigEndianSafe(int index, long start, ArrowBuf buffer, int length) if (LITTLE_ENDIAN) { // swap bytes to convert BE to LE for (int byteIdx = 0; byteIdx < length; ++byteIdx) { - byte val = MemoryUtil.UNSAFE.getByte((inAddress + length - 1) - byteIdx); - MemoryUtil.UNSAFE.putByte(outAddress + byteIdx, val); + byte val = MemoryUtil.getByte((inAddress + length - 1) - byteIdx); + MemoryUtil.putByte(outAddress + byteIdx, val); } // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress); + byte msb = MemoryUtil.getByte(inAddress); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress + length, DecimalVector.TYPE_WIDTH - length, pad); } } else { - MemoryUtil.UNSAFE.copyMemory( - inAddress, outAddress + DecimalVector.TYPE_WIDTH - length, length); + MemoryUtil.copyMemory(inAddress, outAddress + DecimalVector.TYPE_WIDTH - length, length); // sign extend if (length < TYPE_WIDTH) { - byte msb = MemoryUtil.UNSAFE.getByte(inAddress); + byte msb = MemoryUtil.getByte(inAddress); final byte pad = (byte) (msb < 0 ? 0xFF : 0x00); - MemoryUtil.UNSAFE.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); + MemoryUtil.setMemory(outAddress, DecimalVector.TYPE_WIDTH - length, pad); } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index dd86b58b267fb..31b79fe53a4a5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -170,15 +170,15 @@ public static void writeLongToArrowBuf(long value, ArrowBuf bytebuf, int index, final long addressOfValue = bytebuf.memoryAddress() + (long) index * byteWidth; final long padValue = Long.signum(value) == -1 ? 
-1L : 0L; if (LITTLE_ENDIAN) { - MemoryUtil.UNSAFE.putLong(addressOfValue, value); + MemoryUtil.putLong(addressOfValue, value); for (int i = 1; i <= (byteWidth - 8) / 8; i++) { - MemoryUtil.UNSAFE.putLong(addressOfValue + Long.BYTES * i, padValue); + MemoryUtil.putLong(addressOfValue + Long.BYTES * i, padValue); } } else { for (int i = 0; i < (byteWidth - 8) / 8; i++) { - MemoryUtil.UNSAFE.putLong(addressOfValue + Long.BYTES * i, padValue); + MemoryUtil.putLong(addressOfValue + Long.BYTES * i, padValue); } - MemoryUtil.UNSAFE.putLong(addressOfValue + Long.BYTES * (byteWidth - 8) / 8, value); + MemoryUtil.putLong(addressOfValue + Long.BYTES * (byteWidth - 8) / 8, value); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 4f81cba55f1b3..e703571b374eb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -93,7 +93,7 @@ public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) { targetVector.getDataBuffer()); } else { - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getDataBuffer().memoryAddress(), targetVector.getDataBuffer().memoryAddress() + deltaVector.getTypeWidth() * targetVector.getValueCount(), @@ -142,13 +142,13 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { targetVector.getValidityBuffer()); // append data buffer - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getDataBuffer().memoryAddress(), targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); // copy offset buffer - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getOffsetBuffer().memoryAddress() + BaseVariableWidthVector.OFFSET_WIDTH, targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * BaseVariableWidthVector.OFFSET_WIDTH, @@ -214,13 +214,13 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { targetVector.getValidityBuffer()); // append data buffer - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getDataBuffer().memoryAddress(), targetVector.getDataBuffer().memoryAddress() + targetDataSize, deltaDataSize); // copy offset buffer - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getOffsetBuffer().memoryAddress() + BaseLargeVariableWidthVector.OFFSET_WIDTH, targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH, @@ -292,7 +292,7 @@ public ValueVector visit(ListVector deltaVector, Void value) { targetVector.getValidityBuffer()); // append offset buffer - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getOffsetBuffer().memoryAddress() + ListVector.OFFSET_WIDTH, targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * ListVector.OFFSET_WIDTH, @@ -362,7 +362,7 @@ public ValueVector visit(LargeListVector deltaVector, Void value) { targetVector.getValidityBuffer()); // append offset buffer - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getOffsetBuffer().memoryAddress() + ListVector.OFFSET_WIDTH, targetVector.getOffsetBuffer().memoryAddress() + (targetVector.getValueCount() + 1) * LargeListVector.OFFSET_WIDTH, @@ -499,7 +499,7 @@ public ValueVector visit(UnionVector deltaVector, Void value) { } // append type buffers - 
MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getTypeBufferAddress(), targetUnionVector.getTypeBufferAddress() + targetVector.getValueCount(), deltaVector.getValueCount()); @@ -564,7 +564,7 @@ public ValueVector visit(DenseUnionVector deltaVector, Void value) { } // append type buffers - MemoryUtil.UNSAFE.copyMemory( + MemoryUtil.copyMemory( deltaVector.getTypeBuffer().memoryAddress(), targetDenseUnionVector.getTypeBuffer().memoryAddress() + targetVector.getValueCount(), deltaVector.getValueCount()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java index f17c065d4e2df..68aa61962ba3f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java @@ -115,34 +115,34 @@ public void testAllBitsSet() { try (RootAllocator allocator = new RootAllocator(bufferLength); ArrowBuf validityBuffer = allocator.buffer(bufferLength)) { - MemoryUtil.UNSAFE.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + MemoryUtil.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); int bitLength = 1024; assertTrue(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); bitLength = 1028; assertTrue(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); - MemoryUtil.UNSAFE.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + MemoryUtil.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); bitLength = 1025; BitVectorHelper.unsetBit(validityBuffer, 12); assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); - MemoryUtil.UNSAFE.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + MemoryUtil.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); bitLength = 1025; BitVectorHelper.unsetBit(validityBuffer, 1024); assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); - MemoryUtil.UNSAFE.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + MemoryUtil.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); bitLength = 1026; BitVectorHelper.unsetBit(validityBuffer, 1024); assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); - MemoryUtil.UNSAFE.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + MemoryUtil.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); bitLength = 1027; BitVectorHelper.unsetBit(validityBuffer, 1025); assertFalse(BitVectorHelper.checkAllBitsEqualTo(validityBuffer, bitLength, true)); - MemoryUtil.UNSAFE.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); + MemoryUtil.setMemory(validityBuffer.memoryAddress(), bufferLength, (byte) -1); bitLength = 1031; BitVectorHelper.unsetBit(validityBuffer, 1029); BitVectorHelper.unsetBit(validityBuffer, 1030); From 7e50097ba4239cf9368b77f438d877d4176141c9 Mon Sep 17 00:00:00 2001 From: Costi Ciudatu Date: Tue, 30 Jul 2024 22:30:57 +0300 Subject: [PATCH 47/73] GH-43469: [Java] Change the default CompressionCodec.Factory to leverage compression support transparently (#43471) ### Rationale for this change Add compression support to Flight RPC and others by just including the `arrow-compression` jar in the module path (or classpath). ### What changes are included in this PR? 
Change the default compression factory to the new `CompressionCodec.Factory.INSTANCE`, a ServiceLoader-backed singleton that delegates to the best suited available implementation in the module/class path for each codec type. ### Are these changes tested? yes ### Are there any user-facing changes? No. * GitHub Issue: #43469 Authored-by: Costi Ciudatu Signed-off-by: Dane Pitkin --- .../src/main/java/module-info.java | 6 +++ ...ector.compression.CompressionCodec$Factory | 15 ++++++ .../TestCompressionCodecServiceProvider.java | 50 +++++++++++++++++++ java/vector/src/main/java/module-info.java | 2 + .../org/apache/arrow/vector/VectorLoader.java | 2 +- .../vector/compression/CompressionCodec.java | 44 ++++++++++++++++ .../arrow/vector/ipc/ArrowFileReader.java | 3 +- .../apache/arrow/vector/ipc/ArrowReader.java | 3 +- .../arrow/vector/ipc/ArrowStreamReader.java | 3 +- 9 files changed, 121 insertions(+), 7 deletions(-) create mode 100644 java/compression/src/main/resources/META-INF/services/org.apache.arrow.vector.compression.CompressionCodec$Factory create mode 100644 java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodecServiceProvider.java diff --git a/java/compression/src/main/java/module-info.java b/java/compression/src/main/java/module-info.java index 6bf989e4c142e..113a1dba9d45f 100644 --- a/java/compression/src/main/java/module-info.java +++ b/java/compression/src/main/java/module-info.java @@ -15,6 +15,8 @@ * limitations under the License. */ +import org.apache.arrow.vector.compression.CompressionCodec; + module org.apache.arrow.compression { exports org.apache.arrow.compression; @@ -22,4 +24,8 @@ requires org.apache.arrow.memory.core; requires org.apache.arrow.vector; requires org.apache.commons.compress; + + // Also defined under META-INF/services to support non-modular applications + provides CompressionCodec.Factory with + org.apache.arrow.compression.CommonsCompressionFactory; } diff --git a/java/compression/src/main/resources/META-INF/services/org.apache.arrow.vector.compression.CompressionCodec$Factory b/java/compression/src/main/resources/META-INF/services/org.apache.arrow.vector.compression.CompressionCodec$Factory new file mode 100644 index 0000000000000..ccdcef9aed96a --- /dev/null +++ b/java/compression/src/main/resources/META-INF/services/org.apache.arrow.vector.compression.CompressionCodec$Factory @@ -0,0 +1,15 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+org.apache.arrow.compression.CommonsCompressionFactory diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodecServiceProvider.java b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodecServiceProvider.java new file mode 100644 index 0000000000000..795e05d7cb123 --- /dev/null +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodecServiceProvider.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.compression; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.arrow.vector.compression.CompressionCodec; +import org.apache.arrow.vector.compression.CompressionUtil; +import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.junit.jupiter.api.Test; + +public class TestCompressionCodecServiceProvider { + + /** + * When arrow-compression is in the classpath/module-path, {@link + * CompressionCodec.Factory#INSTANCE} should be able to handle all codec types. + */ + @Test + public void testSupportedCompressionTypes() { + assertThrows( // no-compression doesn't support any actual compression types + IllegalArgumentException.class, + () -> checkAllCodecTypes(NoCompressionCodec.Factory.INSTANCE)); + assertThrows( // commons-compression doesn't support the uncompressed type + IllegalArgumentException.class, + () -> checkAllCodecTypes(CommonsCompressionFactory.INSTANCE)); + checkAllCodecTypes( // and the winner is... 
+ CompressionCodec.Factory.INSTANCE); // combines the two above to support all types + } + + private void checkAllCodecTypes(CompressionCodec.Factory factory) { + for (CompressionUtil.CodecType codecType : CompressionUtil.CodecType.values()) { + assertNotNull(factory.createCodec(codecType)); + } + } +} diff --git a/java/vector/src/main/java/module-info.java b/java/vector/src/main/java/module-info.java index 73af2d1b67efd..fdea2bd06726e 100644 --- a/java/vector/src/main/java/module-info.java +++ b/java/vector/src/main/java/module-info.java @@ -47,4 +47,6 @@ requires org.apache.arrow.memory.core; requires org.apache.commons.codec; requires org.slf4j; + + uses org.apache.arrow.vector.compression.CompressionCodec.Factory; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index c076161bc21d6..ecd3fb91241b1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -50,7 +50,7 @@ public class VectorLoader { * @param root the root to add vectors to based on schema */ public VectorLoader(VectorSchemaRoot root) { - this(root, NoCompressionCodec.Factory.INSTANCE); + this(root, CompressionCodec.Factory.INSTANCE); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java b/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java index 2de8ff246591b..dd62108a84a6b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionCodec.java @@ -16,6 +16,9 @@ */ package org.apache.arrow.vector.compression; +import java.util.EnumMap; +import java.util.Map; +import java.util.ServiceLoader; import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; @@ -51,11 +54,52 @@ public interface CompressionCodec { /** Factory to create compression codec. */ interface Factory { + /** + * This combines all the available factories registered as service providers in the module path. + * For each {@link CompressionUtil.CodecType compression codec type}, it will use whatever + * factory supports it, i.e. doesn't throw on `createCodec(type)`. If multiple factories + * registered as service providers support the same codec type, the first one encountered while + * iterating over the {@link ServiceLoader} will be selected. A codec type that is not supported + * by any registered service provider will fall back to {@link + * NoCompressionCodec.Factory#INSTANCE} for backwards compatibility. + */ + Factory INSTANCE = bestEffort(); /** Creates the codec based on the codec type. */ CompressionCodec createCodec(CompressionUtil.CodecType codecType); /** Creates the codec based on the codec type and compression level. 
*/ CompressionCodec createCodec(CompressionUtil.CodecType codecType, int compressionLevel); + + private static Factory bestEffort() { + final ServiceLoader<Factory> serviceLoader = ServiceLoader.load(Factory.class); + final Map<CompressionUtil.CodecType, Factory> factories = + new EnumMap<>(CompressionUtil.CodecType.class); + for (Factory factory : serviceLoader) { + for (CompressionUtil.CodecType codecType : CompressionUtil.CodecType.values()) { + try { + factory.createCodec(codecType); // will throw if not supported + factories.putIfAbsent(codecType, factory); + } catch (Throwable ignored) { + } + } + } + + final Factory fallback = NoCompressionCodec.Factory.INSTANCE; + return new Factory() { + @Override + public CompressionCodec createCodec(CompressionUtil.CodecType codecType) { + return factories.getOrDefault(codecType, fallback).createCodec(codecType); + } + + @Override + public CompressionCodec createCodec( + CompressionUtil.CodecType codecType, int compressionLevel) { + return factories + .getOrDefault(codecType, fallback) + .createCodec(codecType, compressionLevel); + } + }; + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java index 982651b2ff3de..7cac0a15a198e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java @@ -27,7 +27,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.util.VisibleForTesting; import org.apache.arrow.vector.compression.CompressionCodec; -import org.apache.arrow.vector.compression.NoCompressionCodec; import org.apache.arrow.vector.ipc.message.ArrowBlock; import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; import org.apache.arrow.vector.ipc.message.ArrowFooter; @@ -64,7 +63,7 @@ public ArrowFileReader( } public ArrowFileReader(SeekableReadChannel in, BufferAllocator allocator) { - this(in, allocator, NoCompressionCodec.Factory.INSTANCE); + this(in, allocator, CompressionCodec.Factory.INSTANCE); } public ArrowFileReader(SeekableByteChannel in, BufferAllocator allocator) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java index 15ade38cd3d62..7f4addf2d0dea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java @@ -28,7 +28,6 @@ import org.apache.arrow.vector.VectorLoader; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.compression.CompressionCodec; -import org.apache.arrow.vector.compression.NoCompressionCodec; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; @@ -50,7 +49,7 @@ public abstract class ArrowReader implements DictionaryProvider, AutoCloseable { private final CompressionCodec.Factory compressionFactory; protected ArrowReader(BufferAllocator allocator) { - this(allocator, NoCompressionCodec.Factory.INSTANCE); + this(allocator, CompressionCodec.Factory.INSTANCE); } protected ArrowReader(BufferAllocator allocator, CompressionCodec.Factory compressionFactory) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java index 660c6a5f8986a..69811dc71727c 
100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java @@ -25,7 +25,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.compression.CompressionCodec; -import org.apache.arrow.vector.compression.NoCompressionCodec; import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.ipc.message.MessageChannelReader; @@ -65,7 +64,7 @@ public ArrowStreamReader( * @param allocator to allocate new buffers */ public ArrowStreamReader(MessageChannelReader messageReader, BufferAllocator allocator) { - this(messageReader, allocator, NoCompressionCodec.Factory.INSTANCE); + this(messageReader, allocator, CompressionCodec.Factory.INSTANCE); } /** From 62fd98704dbe2684018707a7b135751fa7bfbe5a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 31 Jul 2024 11:43:05 +0900 Subject: [PATCH 48/73] GH-43467: [C++] Add support for the official LZ4 CMake package (#43468) ### Rationale for this change LZ4 1.10.0 provides `LZ4::lz4` but LZ4 1.9.4 provides only `LZ4::lz4_shared` and `LZ4::lz4_static`. So we need to prepare `LZ4::lz4` on our side. ### What changes are included in this PR? Define `LZ4::lz4` as an alias of `LZ4::lz4_shared` or `LZ4::lz4_static` if `LZ4::lz4` doesn't exist. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #43467 Authored-by: Sutou Kouhei Signed-off-by: Jacob Wujciak-Jens --- cpp/cmake_modules/Findlz4Alt.cmake | 12 +++++++++--- cpp/cmake_modules/ThirdpartyToolchain.cmake | 5 ++++- .../apache-arrow/apt/debian-bookworm/Dockerfile | 1 + .../apache-arrow/apt/debian-trixie/Dockerfile | 1 + .../apache-arrow/apt/ubuntu-focal/Dockerfile | 1 + .../apache-arrow/apt/ubuntu-jammy/Dockerfile | 1 + .../apache-arrow/apt/ubuntu-noble/Dockerfile | 1 + .../linux-packages/apache-arrow/debian/control.in | 2 ++ dev/tasks/linux-packages/apache-arrow/debian/rules | 4 +++- 9 files changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/cmake_modules/Findlz4Alt.cmake b/cpp/cmake_modules/Findlz4Alt.cmake index 77a22957f7964..91e735107a954 100644 --- a/cpp/cmake_modules/Findlz4Alt.cmake +++ b/cpp/cmake_modules/Findlz4Alt.cmake @@ -29,9 +29,15 @@ endif() find_package(lz4 ${find_package_args}) if(lz4_FOUND) set(lz4Alt_FOUND TRUE) - # Conan uses lz4::lz4 not LZ4::lz4 - if(NOT TARGET LZ4::lz4 AND TARGET lz4::lz4) - add_library(LZ4::lz4 ALIAS lz4::lz4) + if(NOT TARGET LZ4::lz4) + # Conan uses lz4::lz4 not LZ4::lz4 + if(TARGET lz4::lz4) + add_library(LZ4::lz4 ALIAS lz4::lz4) + elseif(ARROW_LZ4_USE_SHARED) + add_library(LZ4::lz4 ALIAS LZ4::lz4_shared) + else() + add_library(LZ4::lz4 ALIAS LZ4::lz4_static) + endif() endif() return() endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 92bd80014e8ae..495aa70483605 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4516,9 +4516,12 @@ function(build_orc) OFF CACHE BOOL "" FORCE) get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) + if(NOT LZ4_INCLUDE_DIR) + find_path(LZ4_INCLUDE_DIR NAMES lz4.h) + endif() get_filename_component(LZ4_ROOT "${LZ4_INCLUDE_DIR}" DIRECTORY) set(LZ4_HOME - ${LZ4_ROOT} + "${LZ4_ROOT}" CACHE STRING "" FORCE) set(LZ4_LIBRARY LZ4::lz4 diff --git 
a/dev/tasks/linux-packages/apache-arrow/apt/debian-bookworm/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-bookworm/Dockerfile index b38ee72d68c75..ec3bf7751d2d7 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bookworm/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/apt/debian-bookworm/Dockerfile @@ -65,6 +65,7 @@ RUN \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxxhash-dev \ libzstd-dev \ llvm-dev \ lsb-release \ diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-trixie/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-trixie/Dockerfile index 8a6accbfc8b16..c6a09da2dfd5c 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-trixie/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/apt/debian-trixie/Dockerfile @@ -66,6 +66,7 @@ RUN \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxxhash-dev \ libzstd-dev \ llvm-dev \ lsb-release \ diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile index fdd0362680c5a..fe783638b6344 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile @@ -56,6 +56,7 @@ RUN \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxxhash-dev \ libzstd-dev \ llvm-dev \ lsb-release \ diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-jammy/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-jammy/Dockerfile index e6718e59b0aba..1d9065d6b2e61 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-jammy/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-jammy/Dockerfile @@ -58,6 +58,7 @@ RUN \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxxhash-dev \ libzstd-dev \ llvm-dev \ lsb-release \ diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile index 87ea2402456b0..f5f5e12f4d560 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile @@ -60,6 +60,7 @@ RUN \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxxhash-dev \ libzstd-dev \ llvm-dev \ lsb-release \ diff --git a/dev/tasks/linux-packages/apache-arrow/debian/control.in b/dev/tasks/linux-packages/apache-arrow/debian/control.in index 24e2839021aa8..cf3f488cc17e0 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/control.in +++ b/dev/tasks/linux-packages/apache-arrow/debian/control.in @@ -27,6 +27,7 @@ Build-Depends: libssl-dev, libthrift-dev, libutf8proc-dev, + libxxhash-dev, libzstd-dev, meson, ninja-build, @@ -152,6 +153,7 @@ Depends: libsnappy-dev, libssl-dev, libutf8proc-dev, + libxxhash-dev, libzstd-dev, nlohmann-json-dev | nlohmann-json3-dev, @USE_SYSTEM_GRPC@ protobuf-compiler-grpc, diff --git a/dev/tasks/linux-packages/apache-arrow/debian/rules b/dev/tasks/linux-packages/apache-arrow/debian/rules index 6c3074ab234e1..40877f44dbe66 100755 --- a/dev/tasks/linux-packages/apache-arrow/debian/rules +++ b/dev/tasks/linux-packages/apache-arrow/debian/rules @@ -107,8 +107,10 @@ override_dh_auto_test: # libarrow.so: avoid failing with "Unknown DWARF DW_OP_172" # libgandiva.so: avoid failing with "Unknown DWARF DW_OP_255" +# libparquet.so: avoid failing with "Unknown DWARF DW_OP_4" # See also: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=949296 override_dh_dwz: dh_dwz \ --exclude=libarrow.so \ - 
--exclude=libgandiva.so + --exclude=libgandiva.so \ + --exclude=libparquet.so From e588ed175f9058010d61c90c8b804b3f9bcf3be0 Mon Sep 17 00:00:00 2001 From: shinespiked <173708861+shinespiked@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:35:30 -0400 Subject: [PATCH 49/73] GH-42014: [Python] Let StructArray.from_arrays accept a type in addition to names or fields (#43047) ### Rationale for this change StructArray.from_arrays currently accepts names or fields to create the struct array. However, if you already have a struct type, it's more convenient to pass that in and allow the function to use it to build the StructArray, instead of the user having to pull out the fields themselves. ### What changes are included in this PR? Add a new argument to StructArray.from_arrays called type. The function prevents both fields and type from being passed by raising a ValueError. If type is not None, the existing fields argument is set from the fields of type. This allows all of the existing code in the function to remain untouched. ### Are these changes tested? Yes. In addition to the existing test creating the StructArray from fields, a test is added to make sure that a struct type can be used to create the array. ### Are there any user-facing changes? Yes, the StructArray.from_arrays function now has an extra optional argument. * GitHub Issue: #42014 Authored-by: Akshay Subramanian <173708861+shinespiked@users.noreply.github.com> Signed-off-by: AlenkaF --- python/pyarrow/array.pxi | 14 ++++++++++++-- python/pyarrow/tests/test_array.py | 7 +++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b1f90cd16537b..997f208a5dec4 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3977,12 +3977,12 @@ cdef class StructArray(Array): @staticmethod def from_arrays(arrays, names=None, fields=None, mask=None, - memory_pool=None): + memory_pool=None, type=None): """ Construct StructArray from collection of arrays representing each field in the struct. - Either field names or field instances must be passed. + Either field names, field instances or a struct type must be passed. Parameters ---------- @@ -3995,6 +3995,8 @@ cdef class StructArray(Array): Indicate which values are null (True) or not null (False). memory_pool : MemoryPool (optional) For memory allocations, if required, otherwise uses default pool. + type : pyarrow.StructType (optional) + Struct type for name and type of each child. 
Returns ------- @@ -4013,6 +4015,14 @@ cdef class StructArray(Array): Field py_field DataType struct_type + if fields is not None and type is not None: + raise ValueError('Must pass either fields or type, not both') + + if type is not None: + fields = [] + for field in type: + fields.append(field) + if names is None and fields is None: raise ValueError('Must pass either names or fields') if names is not None and fields is not None: diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 30d258b9aabd8..c44ec3f8e1afe 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -707,6 +707,13 @@ def test_struct_from_arrays(): assert not arr.type[0].nullable assert arr.to_pylist() == expected_list + # From structtype + structtype = pa.struct([fa, fb, fc]) + arr = pa.StructArray.from_arrays([a, b, c], type=structtype) + assert arr.type == pa.struct([fa, fb, fc]) + assert not arr.type[0].nullable + assert arr.to_pylist() == expected_list + with pytest.raises(ValueError): pa.StructArray.from_arrays([a, b, c], fields=[fa, fb]) From c6be2df7dbd7bb536b35c6506a2dfd1b8b893fd5 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 1 Aug 2024 02:12:35 +0800 Subject: [PATCH 50/73] GH-43444: [C++] Add benchmark for binary view builder (#43445) ### Rationale for this change Add a benchmark for `BinaryViewBuilder`. ### What changes are included in this PR? Adds benchmarks covering inline and non-inline `BinaryViewBuilder` appends. ### Are these changes tested? No ### Are there any user-facing changes? No * GitHub Issue: #43444 Authored-by: mwish Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/array/builder_binary.h | 6 ++-- cpp/src/arrow/builder_benchmark.cc | 40 +++++++++++++++++++++++++++ cpp/src/arrow/util/binary_view_util.h | 19 +++++++++---- 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index d825f7d32520a..442e4a26320a2 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -500,9 +500,9 @@ class ARROW_EXPORT StringHeapBuilder { ARROW_RETURN_NOT_OK(Reserve(length)); } - auto v = - util::ToBinaryView(value, static_cast<int32_t>(length), - static_cast<int32_t>(blocks_.size() - 1), current_offset_); + auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length), + static_cast<int32_t>(blocks_.size() - 1), + current_offset_); memcpy(current_out_buffer_, value, static_cast<size_t>(length)); current_out_buffer_ += length; diff --git a/cpp/src/arrow/builder_benchmark.cc b/cpp/src/arrow/builder_benchmark.cc index 84f27d20ee038..8ec7373a1de1f 100644 --- a/cpp/src/arrow/builder_benchmark.cc +++ b/cpp/src/arrow/builder_benchmark.cc @@ -150,6 +150,44 @@ static void BuildBinaryArray(benchmark::State& state) { // NOLINT non-const ref state.SetItemsProcessed(state.iterations() * kItemsProcessed); } + +static void BuildInlineBinaryViewArray( + benchmark::State& state) { // NOLINT non-const reference + std::string_view kBinaryStrings[] = {"1", "12345678", "12345", "123456789", + "12", "", " "}; + + for (auto _ : state) { + BinaryViewBuilder builder(memory_tracker.memory_pool()); + + for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) { + ABORT_NOT_OK(builder.Append(kBinaryStrings[i % 7])); + } + + std::shared_ptr<Array> out; + ABORT_NOT_OK(builder.Finish(&out)); + } + + state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); +} + +static void BuildNonInlineBinaryViewArray( + benchmark::State& state) { // NOLINT non-const 
reference + const char* kLargeBinaryString = "12345678901234567890123456789012345678901234567890"; + for (auto _ : state) { + BinaryViewBuilder builder(memory_tracker.memory_pool()); + + for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) { + ABORT_NOT_OK(builder.Append(kLargeBinaryString)); + } + + std::shared_ptr<Array> out; + ABORT_NOT_OK(builder.Finish(&out)); + } + + state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); +} + static void BuildChunkedBinaryArray( benchmark::State& state) { // NOLINT non-const reference // 1MB chunks @@ -458,6 +496,8 @@ BENCHMARK(BuildBinaryArray); BENCHMARK(BuildChunkedBinaryArray); BENCHMARK(BuildFixedSizeBinaryArray); BENCHMARK(BuildDecimalArray); +BENCHMARK(BuildInlineBinaryViewArray); +BENCHMARK(BuildNonInlineBinaryViewArray); BENCHMARK(BuildInt64DictionaryArrayRandom); BENCHMARK(BuildInt64DictionaryArraySequential); diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h index 94f7a5bdfa667..2206918724969 100644 --- a/cpp/src/arrow/util/binary_view_util.h +++ b/cpp/src/arrow/util/binary_view_util.h @@ -26,6 +26,7 @@ namespace arrow::util { inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) { + assert(size <= BinaryViewType::kInlineSize); // Small string: inlined. Bytes beyond size are zeroed BinaryViewType::c_type out; out.inlined = {size, {}}; @@ -34,15 +35,13 @@ inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) } inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) { + assert(v.size() <= BinaryViewType::kInlineSize); return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size())); } -inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size, - int32_t buffer_index, int32_t offset) { - if (size <= BinaryViewType::kInlineSize) { - return ToInlineBinaryView(data, size); - } - + inline BinaryViewType::c_type ToNonInlineBinaryView(const void* data, int32_t size, + int32_t buffer_index, + int32_t offset) { // Large string: store index/offset. BinaryViewType::c_type out; out.ref = {size, {}, buffer_index, offset}; @@ -50,6 +49,14 @@ inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size, return out; } +inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size, + int32_t buffer_index, int32_t offset) { + if (size <= BinaryViewType::kInlineSize) { + return ToInlineBinaryView(data, size); + } + return ToNonInlineBinaryView(data, size, buffer_index, offset); +} + inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t buffer_index, int32_t offset) { return ToBinaryView(v.data(), static_cast<int32_t>(v.size()), buffer_index, offset); From e9f6667291b68ef5d82b4a193fdd84c8ef06a2cf Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Wed, 31 Jul 2024 15:07:15 -0500 Subject: [PATCH 51/73] GH-43443: [Go] [IPC] Infer schema from first record if not specified (#43484) ### Rationale for this change Fixes: #43443 Makes usage of the IPC writer, and any writers that use it such as the flight writer, simpler. ### What changes are included in this PR? - Infer schema from first record if schema is not specified - IPC and Flight tests ### Are these changes tested? Yes ### Are there any user-facing changes? Any `ipc.Writer` that does not specify the optional argument `ipc.WithSchema` will no longer return an error as long as the incoming stream of records has a consistent schema.
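For illustration, a minimal sketch of the new default behavior from a caller's point of view, mirroring the `TestWriterInferSchema` case in the diff below (the single-column record is just example data):

```go
package main

import (
	"bytes"

	"github.com/apache/arrow/go/v18/arrow"
	"github.com/apache/arrow/go/v18/arrow/array"
	"github.com/apache/arrow/go/v18/arrow/ipc"
	"github.com/apache/arrow/go/v18/arrow/memory"
)

func main() {
	// Build a small single-column record to stream.
	schema := arrow.NewSchema([]arrow.Field{{Name: "col", Type: arrow.PrimitiveTypes.Int8}}, nil)
	bldr := array.NewRecordBuilder(memory.DefaultAllocator, schema)
	defer bldr.Release()
	bldr.Field(0).(*array.Int8Builder).AppendValues([]int8{1, 2, 3, 4, 5}, nil)
	rec := bldr.NewRecord()
	defer rec.Release()

	var buf bytes.Buffer
	// No ipc.WithSchema option: with this change the writer adopts the
	// schema of the first record instead of failing on a nil schema.
	w := ipc.NewWriter(&buf)
	if err := w.Write(rec); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil {
		panic(err)
	}
}
```

If `ipc.WithSchema` is passed, mismatching records are still rejected, as the `TestWriterInconsistentSchema` case below shows.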
* GitHub Issue: #43443 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/arrow/flight/flight_test.go | 35 ++++++++++++++++++++++++++++++++++ go/arrow/ipc/writer.go | 8 ++++++-- go/arrow/ipc/writer_test.go | 19 ++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/go/arrow/flight/flight_test.go b/go/arrow/flight/flight_test.go index fe896f39a2b21..a03d839e9484d 100755 --- a/go/arrow/flight/flight_test.go +++ b/go/arrow/flight/flight_test.go @@ -23,11 +23,13 @@ import ( "io" "testing" + "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/flight" "github.com/apache/arrow/go/v18/arrow/internal/arrdata" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/stretchr/testify/require" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials/insecure" @@ -449,3 +451,36 @@ func TestReaderError(t *testing.T) { t.Fatal("should have errored") } } + +func TestWriterInferSchema(t *testing.T) { + recs, ok := arrdata.Records["primitives"] + require.True(t, ok) + + fs := flightStreamWriter{} + w := flight.NewRecordWriter(&fs) + + for _, rec := range recs { + require.NoError(t, w.Write(rec)) + } + + require.NoError(t, w.Close()) +} + +func TestWriterInconsistentSchema(t *testing.T) { + recs, ok := arrdata.Records["primitives"] + require.True(t, ok) + + schema := arrow.NewSchema([]arrow.Field{{Name: "unknown", Type: arrow.PrimitiveTypes.Int8}}, nil) + fs := flightStreamWriter{} + w := flight.NewRecordWriter(&fs, ipc.WithSchema(schema)) + + require.ErrorContains(t, w.Write(recs[0]), "arrow/ipc: tried to write record batch with different schema") + require.NoError(t, w.Close()) +} + +type flightStreamWriter struct{} + +// Send implements flight.DataStreamWriter. 
+func (f *flightStreamWriter) Send(data *flight.FlightData) error { return nil } + +var _ flight.DataStreamWriter = (*flightStreamWriter)(nil) diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index ca4f77d35e17f..02c67635bb2fd 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -159,15 +159,19 @@ func (w *Writer) Write(rec arrow.Record) (err error) { } }() + incomingSchema := rec.Schema() + if !w.started { + if w.schema == nil { + w.schema = incomingSchema + } err := w.start() if err != nil { return err } } - schema := rec.Schema() - if schema == nil || !schema.Equal(w.schema) { + if incomingSchema == nil || !incomingSchema.Equal(w.schema) { return errInconsistentSchema } diff --git a/go/arrow/ipc/writer_test.go b/go/arrow/ipc/writer_test.go index e5683243e4546..60d811e68e87e 100644 --- a/go/arrow/ipc/writer_test.go +++ b/go/arrow/ipc/writer_test.go @@ -235,3 +235,22 @@ func TestWriteWithCompressionAndMinSavings(t *testing.T) { } } } + +func TestWriterInferSchema(t *testing.T) { + bldr := array.NewRecordBuilder(memory.DefaultAllocator, arrow.NewSchema([]arrow.Field{{Name: "col", Type: arrow.PrimitiveTypes.Int8}}, nil)) + bldr.Field(0).(*array.Int8Builder).AppendValues([]int8{1, 2, 3, 4, 5}, nil) + rec := bldr.NewRecord() + defer rec.Release() + + var buf bytes.Buffer + w := NewWriter(&buf) + + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + r, err := NewReader(&buf) + require.NoError(t, err) + defer r.Release() + + require.True(t, r.Schema().Equal(rec.Schema())) +} From 0dec116d83e4160aa3387fa87be2a99ac0fd3390 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 31 Jul 2024 17:09:44 -0400 Subject: [PATCH 52/73] MINOR: [R] Work around test failure in tidyquery revdep (#43498) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change See https://github.com/apache/arrow/issues/43317#issuecomment-2261299681. `tidyquery` is assembling queries in some way such that when `summarize.arrow_dplyr_query()` is called, the calling environment isn't a call, so `match.call()` fails. ### What changes are included in this PR? This PR wraps the `match.call()` call in a `try()`. The call is only used to do `abandon_ship()` on in-memory data anyway. So if the call is not available, it treats it like you're making a query on a Dataset and it tells you to `collect()` yourself. ### Are these changes tested? I couldn't figure out how to reproduce what was going on inside `tidyquery` to write a reproducer, and I don't think this is worth adding `tidyquery` to Suggests for. I confirmed locally that `tidyquery` tests pass with this change, so our revdeps should be clear. ### Are there any user-facing changes? 🙅 Authored-by: Neal Richardson Signed-off-by: Jonathan Keane --- r/R/dplyr-eval.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 1997d698c0b24..2dce24117a343 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -201,7 +201,12 @@ try_arrow_dplyr <- function(expr) { parent <- caller_env() # Make sure that the call is available in the parent environment # so that we can use it in abandon_ship, if needed - evalq(call <- match.call(), parent) + # (but don't error if we're in some weird context where we can't get the call, + # which could happen if you're code-generating or something?) 
+ try( + evalq(call <- match.call(), parent), + silent = !getOption("arrow.debug", FALSE) + ) tryCatch( eval(expr, parent), @@ -217,7 +222,10 @@ # and that the function being called also exists in the dplyr namespace. abandon_ship <- function(err, env) { .data <- get(".data", envir = env) - if (query_on_dataset(.data)) { + # If there's no call (see comment in try_arrow_dplyr), we can't eval with + # dplyr even if the data is in memory already + call <- try(get("call", envir = env), silent = TRUE) + if (query_on_dataset(.data) || inherits(call, "try-error")) { # Add a note suggesting `collect()` to the error message. # If there are other suggestions already there (with the > arrow name), # collect() isn't the only suggestion, so message differently @@ -231,7 +239,6 @@ } # Else, warn, collect(), and run in regular dplyr - call <- get("call", envir = env) rlang::warn( message = paste0("In ", format_expr(err$call), ": "), body = c("i" = conditionMessage(err), ">" = "Pulling data into R") From d4d92e4896d8108aef25c6ef199e87890d027b22 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Thu, 1 Aug 2024 14:42:37 +0530 Subject: [PATCH 53/73] GH-41569: [Java] ListViewVector Implementation for UnionListViewReader (#43077) ### Rationale for this change This PR contains the multiple components required to add the C Data interface for `ListViewVector`. It solves the following major issues associated with that work. ### What changes are included in this PR? - [x] https://github.com/apache/arrow/issues/41269 - [x] https://github.com/apache/arrow/issues/41270 Apart from that, the following features have also been added: - [x] JSON Writer/Reader - [x] Complex Writer functionality ### Are these changes tested? Yes ### Are there any user-facing changes? Yes, we are introducing the usage of `listview` instead of `list`, `startListView` instead of `startList`, and `endListView` instead of `endList` for the `ListView`-related APIs used when building the `ListViewVector`.
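To illustrate, here is a minimal, hypothetical sketch of building a `ListViewVector` through the new writer API. The `ListViewVector.empty()` factory and `getWriter()` accessor are assumed to mirror the existing `ListVector` conventions; `UnionListViewWriter` and `startListView()`/`endListView()` are the names introduced by this PR:

```java
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.ListViewVector;
import org.apache.arrow.vector.complex.impl.UnionListViewWriter;

public class ListViewWriterSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
         // empty() factory assumed to follow the ListVector.empty() pattern
         ListViewVector vector = ListViewVector.empty("listview", allocator)) {
      UnionListViewWriter writer = vector.getWriter();
      writer.allocate();

      // Row 0: [1, 2] -- startListView()/endListView() replace startList()/endList()
      writer.setPosition(0);
      writer.startListView();
      writer.integer().writeInt(1);
      writer.integer().writeInt(2);
      writer.endListView();

      // Row 1: [3] -- endListView() advances the writer, so no explicit setPosition() is needed
      writer.startListView();
      writer.integer().writeInt(3);
      writer.endListView();

      vector.setValueCount(2);
      System.out.println(vector.getObject(0)); // prints [1,2]
    }
  }
}
```

Note that, per the `UnionListWriter` template changes in this diff, `endListView()` finalizes the row by writing its entry in the size buffer and then moving the writer to the next row.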
* GitHub Issue: #41569 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../templates/AbstractFieldWriter.java | 22 + .../AbstractPromotableFieldWriter.java | 22 + .../main/codegen/templates/BaseWriter.java | 5 + .../main/codegen/templates/ComplexCopier.java | 7 + .../codegen/templates/DenseUnionWriter.java | 21 + .../templates/PromotableViewWriter.java | 167 +++ .../templates}/PromotableWriter.java | 142 +- .../main/codegen/templates/StructWriters.java | 28 + .../codegen/templates/UnionListWriter.java | 32 + .../main/codegen/templates/UnionReader.java | 13 + .../main/codegen/templates/UnionVector.java | 17 + .../codegen/templates/UnionViewWriter.java | 210 +++ .../main/codegen/templates/UnionWriter.java | 72 +- .../complex/AbstractContainerVector.java | 5 + .../complex/BaseRepeatedValueViewVector.java | 2 +- .../arrow/vector/complex/ListViewVector.java | 211 ++- .../complex/impl/ComplexWriterImpl.java | 40 + .../complex/impl/UnionListViewReader.java | 111 ++ .../arrow/vector/ipc/JsonFileReader.java | 8 +- .../arrow/vector/ipc/JsonFileWriter.java | 12 +- .../arrow/vector/TestListViewVector.java | 681 ++++++++-- .../apache/arrow/vector/TestValueVector.java | 33 + .../complex/writer/TestComplexWriter.java | 1170 ++++++++++++----- .../apache/arrow/vector/ipc/BaseFileTest.java | 16 +- .../testing/ValueVectorDataPopulator.java | 32 + 25 files changed, 2513 insertions(+), 566 deletions(-) create mode 100644 java/vector/src/main/codegen/templates/PromotableViewWriter.java rename java/vector/src/main/{java/org/apache/arrow/vector/complex/impl => codegen/templates}/PromotableWriter.java (81%) create mode 100644 java/vector/src/main/codegen/templates/UnionViewWriter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListViewReader.java diff --git a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java index 6c2368117f7c2..5ebfb6877fc5b 100644 --- a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java @@ -67,6 +67,16 @@ public void endList() { throw new IllegalStateException(String.format("You tried to end a list when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); } + @Override + public void startListView() { + throw new IllegalStateException(String.format("You tried to start a list view when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void endListView() { + throw new IllegalStateException(String.format("You tried to end a list view when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + @Override public void startMap() { throw new IllegalStateException(String.format("You tried to start a map when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); @@ -184,6 +194,12 @@ public ListWriter list() { return null; } + @Override + public ListWriter listView() { + fail("ListView"); + return null; + } + @Override public MapWriter map() { fail("Map"); @@ -202,6 +218,12 @@ public ListWriter list(String name) { return null; } + @Override + public ListWriter listView(String name) { + fail("ListView"); + return null; + } + @Override public MapWriter map(String name) { fail("Map"); diff --git a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java index 
59f9fb5b8098d..06cb235f7dd99 100644 --- a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -76,6 +76,17 @@ public void endList() { setPosition(idx() + 1); } + @Override + public void startListView() { + getWriter(MinorType.LISTVIEW).startListView(); + } + + @Override + public void endListView() { + getWriter(MinorType.LISTVIEW).endListView(); + setPosition(idx() + 1); + } + @Override public void startMap() { getWriter(MinorType.MAP).startMap(); @@ -267,6 +278,11 @@ public ListWriter list() { return getWriter(MinorType.LIST).list(); } + @Override + public ListWriter listView() { + return getWriter(MinorType.LISTVIEW).listView(); + } + @Override public MapWriter map() { return getWriter(MinorType.LIST).map(); @@ -287,6 +303,11 @@ public ListWriter list(String name) { return getWriter(MinorType.STRUCT).list(name); } + @Override + public ListWriter listView(String name) { + return getWriter(MinorType.STRUCT).listView(name); + } + @Override public MapWriter map(String name) { return getWriter(MinorType.STRUCT).map(name); @@ -296,6 +317,7 @@ public MapWriter map(String name) { public MapWriter map(String name, boolean keysSorted) { return getWriter(MinorType.STRUCT).map(name, keysSorted); } + <#list vv.types as type><#list type.minor as minor> <#assign lowerName = minor.class?uncap_first /> <#if lowerName == "int" ><#assign lowerName = "integer" /> diff --git a/java/vector/src/main/codegen/templates/BaseWriter.java b/java/vector/src/main/codegen/templates/BaseWriter.java index 35df256b324b5..458a4df1eec82 100644 --- a/java/vector/src/main/codegen/templates/BaseWriter.java +++ b/java/vector/src/main/codegen/templates/BaseWriter.java @@ -62,6 +62,7 @@ public interface StructWriter extends BaseWriter { void copyReaderToField(String name, FieldReader reader); StructWriter struct(String name); ListWriter list(String name); + ListWriter listView(String name); MapWriter map(String name); MapWriter map(String name, boolean keysSorted); void start(); @@ -71,8 +72,11 @@ public interface StructWriter extends BaseWriter { public interface ListWriter extends BaseWriter { void startList(); void endList(); + void startListView(); + void endListView(); StructWriter struct(); ListWriter list(); + ListWriter listView(); MapWriter map(); MapWriter map(boolean keysSorted); void copyReader(FieldReader reader); @@ -106,6 +110,7 @@ public interface ComplexWriter { void copyReader(FieldReader reader); StructWriter rootAsStruct(); ListWriter rootAsList(); + ListWriter rootAsListView(); MapWriter rootAsMap(boolean keysSorted); void setPosition(int index); diff --git a/java/vector/src/main/codegen/templates/ComplexCopier.java b/java/vector/src/main/codegen/templates/ComplexCopier.java index 1a3ba940e7977..1eebba018b321 100644 --- a/java/vector/src/main/codegen/templates/ComplexCopier.java +++ b/java/vector/src/main/codegen/templates/ComplexCopier.java @@ -51,6 +51,7 @@ private static void writeValue(FieldReader reader, FieldWriter writer) { switch (mt) { case LIST: + case LISTVIEW: case LARGELIST: case FIXED_SIZE_LIST: if (reader.isSet()) { @@ -158,6 +159,8 @@ private static FieldWriter getStructWriterForReader(FieldReader reader, StructWr return (FieldWriter) writer.list(name); case MAP: return (FieldWriter) writer.map(name); + case LISTVIEW: + return (FieldWriter) writer.listView(name); default: throw new UnsupportedOperationException(reader.getMinorType().toString()); } @@ -180,6 +183,8 @@ private static 
FieldWriter getListWriterForReader(FieldReader reader, ListWriter case MAP: case NULL: return (FieldWriter) writer.list(); + case LISTVIEW: + return (FieldWriter) writer.listView(); default: throw new UnsupportedOperationException(reader.getMinorType().toString()); } @@ -201,6 +206,8 @@ private static FieldWriter getMapWriterForReader(FieldReader reader, MapWriter w case LIST: case NULL: return (FieldWriter) writer.list(); + case LISTVIEW: + return (FieldWriter) writer.listView(); case MAP: return (FieldWriter) writer.map(false); default: diff --git a/java/vector/src/main/codegen/templates/DenseUnionWriter.java b/java/vector/src/main/codegen/templates/DenseUnionWriter.java index e69a62a9e0f6f..8515b759e669e 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionWriter.java +++ b/java/vector/src/main/codegen/templates/DenseUnionWriter.java @@ -83,6 +83,18 @@ public void endList() { getListWriter(typeId).endList(); } + @Override + public void startListView() { + byte typeId = data.getTypeId(idx()); + getListViewWriter(typeId).startList(); + } + + @Override + public void endListView() { + byte typeId = data.getTypeId(idx()); + getListViewWriter(typeId).endList(); + } + private StructWriter getStructWriter(byte typeId) { StructWriter structWriter = (StructWriter) writers[typeId]; if (structWriter == null) { @@ -106,6 +118,15 @@ private ListWriter getListWriter(byte typeId) { return listWriter; } + private ListWriter getListViewWriter(byte typeId) { + ListWriter listWriter = (ListWriter) writers[typeId]; + if (listWriter == null) { + listWriter = new UnionListViewWriter((ListViewVector) data.getVectorByType(typeId), nullableStructWriterFactory); + writers[typeId] = listWriter; + } + return listWriter; + } + public ListWriter asList(byte typeId) { data.setTypeId(idx(), typeId); return getListWriter(typeId); diff --git a/java/vector/src/main/codegen/templates/PromotableViewWriter.java b/java/vector/src/main/codegen/templates/PromotableViewWriter.java new file mode 100644 index 0000000000000..373abbe4b98f8 --- /dev/null +++ b/java/vector/src/main/codegen/templates/PromotableViewWriter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/PromotableViewWriter.java" /> + +<#include "/@includes/license.ftl" /> + + package org.apache.arrow.vector.complex.impl; + +import java.util.Locale; +<#include "/@includes/vv_imports.ftl" /> + +/** + * This FieldWriter implementation delegates all FieldWriter API calls to an inner FieldWriter. This + * inner field writer can start as a specific type, and this class will promote the writer to a + * UnionWriter if a call is made that the specifically typed writer cannot handle. 
A new UnionVector + * is created, wrapping the original vector, and replaces the original vector in the parent vector, + * which can be either an AbstractStructVector or a ListViewVector. + * + * <p>
The writer used can either be for single elements (struct) or lists. + */ +public class PromotableViewWriter extends PromotableWriter { + + public PromotableViewWriter(ValueVector v, FixedSizeListVector fixedListVector) { + super(v, fixedListVector); + } + + public PromotableViewWriter(ValueVector v, FixedSizeListVector fixedListVector, + NullableStructWriterFactory nullableStructWriterFactory) { + super(v, fixedListVector, nullableStructWriterFactory); + } + + public PromotableViewWriter(ValueVector v, LargeListVector largeListVector) { + super(v, largeListVector); + } + + public PromotableViewWriter(ValueVector v, LargeListVector largeListVector, + NullableStructWriterFactory nullableStructWriterFactory) { + super(v, largeListVector, nullableStructWriterFactory); + } + + public PromotableViewWriter(ValueVector v, ListVector listVector) { + super(v, listVector); + } + + public PromotableViewWriter(ValueVector v, ListVector listVector, + NullableStructWriterFactory nullableStructWriterFactory) { + super(v, listVector, nullableStructWriterFactory); + } + + public PromotableViewWriter(ValueVector v, ListViewVector listViewVector, + NullableStructWriterFactory nullableStructWriterFactory) { + super(v, listViewVector, nullableStructWriterFactory); + } + + public PromotableViewWriter(ValueVector v, AbstractStructVector parentContainer) { + super(v, parentContainer); + } + + public PromotableViewWriter(ValueVector v, AbstractStructVector parentContainer, + NullableStructWriterFactory nullableStructWriterFactory) { + super(v, parentContainer, nullableStructWriterFactory); + } + + @Override + protected FieldWriter getWriter(MinorType type, ArrowType arrowType) { + if (state == State.UNION) { + if (requiresArrowType(type)) { + writer = ((UnionWriter) writer).toViewWriter(); + ((UnionViewWriter) writer).getWriter(type, arrowType); + } else { + writer = ((UnionWriter) writer).toViewWriter(); + ((UnionViewWriter) writer).getWriter(type); + } + } else if (state == State.UNTYPED) { + if (type == null) { + // ??? 
+ return null; + } + if (arrowType == null) { + arrowType = type.getType(); + } + FieldType fieldType = new FieldType(addVectorAsNullable, arrowType, null, null); + ValueVector v; + if (listVector != null) { + v = listVector.addOrGetVector(fieldType).getVector(); + } else if (fixedListVector != null) { + v = fixedListVector.addOrGetVector(fieldType).getVector(); + } else if (listViewVector != null) { + v = listViewVector.addOrGetVector(fieldType).getVector(); + } else { + v = largeListVector.addOrGetVector(fieldType).getVector(); + } + v.allocateNew(); + setWriter(v); + writer.setPosition(position); + } else if (type != this.type) { + promoteToUnion(); + if (requiresArrowType(type)) { + writer = ((UnionWriter) writer).toViewWriter(); + ((UnionViewWriter) writer).getWriter(type, arrowType); + } else { + writer = ((UnionWriter) writer).toViewWriter(); + ((UnionViewWriter) writer).getWriter(type); + } + } + return writer; + } + + @Override + public StructWriter struct() { + return getWriter(MinorType.LISTVIEW).struct(); + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + + @Override + public ${capName}Writer ${lowerName}() { + return getWriter(MinorType.LISTVIEW).${lowerName}(); + } + + + + @Override + public void allocate() { + getWriter().allocate(); + } + + @Override + public void clear() { + getWriter().clear(); + } + + @Override + public Field getField() { + return getWriter().getField(); + } + + @Override + public int getValueCapacity() { + return getWriter().getValueCapacity(); + } + + @Override + public void close() throws Exception { + getWriter().close(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/codegen/templates/PromotableWriter.java similarity index 81% rename from java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java rename to java/vector/src/main/codegen/templates/PromotableWriter.java index 7fd0def967388..82bd3c5345cdd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/codegen/templates/PromotableWriter.java @@ -14,32 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/PromotableWriter.java" /> + +<#include "/@includes/license.ftl" /> + package org.apache.arrow.vector.complex.impl; -import java.math.BigDecimal; -import java.nio.ByteBuffer; import java.util.Locale; -import org.apache.arrow.memory.ArrowBuf; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.NullVector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.complex.AbstractStructVector; -import org.apache.arrow.vector.complex.FixedSizeListVector; -import org.apache.arrow.vector.complex.LargeListVector; -import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.ListViewVector; -import org.apache.arrow.vector.complex.MapVector; -import org.apache.arrow.vector.complex.StructVector; -import org.apache.arrow.vector.complex.UnionVector; -import org.apache.arrow.vector.complex.writer.FieldWriter; -import org.apache.arrow.vector.holders.Decimal256Holder; -import org.apache.arrow.vector.holders.DecimalHolder; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.types.pojo.ArrowType; -import org.apache.arrow.vector.types.pojo.Field; -import org.apache.arrow.vector.types.pojo.FieldType; -import org.apache.arrow.vector.util.Text; -import org.apache.arrow.vector.util.TransferPair; +<#include "/@includes/vv_imports.ftl" /> /** * This FieldWriter implementation delegates all FieldWriter API calls to an inner FieldWriter. This @@ -52,27 +36,27 @@ */ public class PromotableWriter extends AbstractPromotableFieldWriter { - private final AbstractStructVector parentContainer; - private final ListVector listVector; - private final ListViewVector listViewVector; - private final FixedSizeListVector fixedListVector; - private final LargeListVector largeListVector; - private final NullableStructWriterFactory nullableStructWriterFactory; - private int position; - private static final int MAX_DECIMAL_PRECISION = 38; - private static final int MAX_DECIMAL256_PRECISION = 76; - - private enum State { + protected final AbstractStructVector parentContainer; + protected final ListVector listVector; + protected final ListViewVector listViewVector; + protected final FixedSizeListVector fixedListVector; + protected final LargeListVector largeListVector; + protected final NullableStructWriterFactory nullableStructWriterFactory; + protected int position; + protected static final int MAX_DECIMAL_PRECISION = 38; + protected static final int MAX_DECIMAL256_PRECISION = 76; + + protected enum State { UNTYPED, SINGLE, UNION } - private MinorType type; - private ValueVector vector; - private UnionVector unionVector; - private State state; - private FieldWriter writer; + protected MinorType type; + protected ValueVector vector; + protected UnionVector unionVector; + protected State state; + protected FieldWriter writer; /** * Constructs a new instance. 
@@ -234,7 +218,7 @@ public void setAddVectorAsNullable(boolean nullable) { } } - private void setWriter(ValueVector v) { + protected void setWriter(ValueVector v) { state = State.SINGLE; vector = v; type = v.getMinorType(); @@ -245,6 +229,9 @@ private void setWriter(ValueVector v) { case LIST: writer = new UnionListWriter((ListVector) vector, nullableStructWriterFactory); break; + case LISTVIEW: + writer = new UnionListViewWriter((ListViewVector) vector, nullableStructWriterFactory); + break; case MAP: writer = new UnionMapWriter((MapVector) vector); break; @@ -277,7 +264,7 @@ public void setPosition(int index) { } } - private boolean requiresArrowType(MinorType type) { + protected boolean requiresArrowType(MinorType type) { return type == MinorType.DECIMAL || type == MinorType.MAP || type == MinorType.DURATION @@ -336,7 +323,7 @@ protected FieldWriter getWriter() { return writer; } - private FieldWriter promoteToUnion() { + protected FieldWriter promoteToUnion() { String name = vector.getField().getName(); TransferPair tp = vector.getTransferPair( @@ -369,76 +356,76 @@ private FieldWriter promoteToUnion() { @Override public void write(DecimalHolder holder) { getWriter( - MinorType.DECIMAL, - new ArrowType.Decimal(MAX_DECIMAL_PRECISION, holder.scale, /*bitWidth=*/ 128)) + MinorType.DECIMAL, + new ArrowType.Decimal(MAX_DECIMAL_PRECISION, holder.scale, /*bitWidth=*/ 128)) .write(holder); } @Override public void writeDecimal(long start, ArrowBuf buffer, ArrowType arrowType) { getWriter( - MinorType.DECIMAL, - new ArrowType.Decimal( - MAX_DECIMAL_PRECISION, - ((ArrowType.Decimal) arrowType).getScale(), - /*bitWidth=*/ 128)) + MinorType.DECIMAL, + new ArrowType.Decimal( + MAX_DECIMAL_PRECISION, + ((ArrowType.Decimal) arrowType).getScale(), + /*bitWidth=*/ 128)) .writeDecimal(start, buffer, arrowType); } @Override public void writeDecimal(BigDecimal value) { getWriter( - MinorType.DECIMAL, - new ArrowType.Decimal(MAX_DECIMAL_PRECISION, value.scale(), /*bitWidth=*/ 128)) + MinorType.DECIMAL, + new ArrowType.Decimal(MAX_DECIMAL_PRECISION, value.scale(), /*bitWidth=*/ 128)) .writeDecimal(value); } @Override public void writeBigEndianBytesToDecimal(byte[] value, ArrowType arrowType) { getWriter( - MinorType.DECIMAL, - new ArrowType.Decimal( - MAX_DECIMAL_PRECISION, - ((ArrowType.Decimal) arrowType).getScale(), - /*bitWidth=*/ 128)) + MinorType.DECIMAL, + new ArrowType.Decimal( + MAX_DECIMAL_PRECISION, + ((ArrowType.Decimal) arrowType).getScale(), + /*bitWidth=*/ 128)) .writeBigEndianBytesToDecimal(value, arrowType); } @Override public void write(Decimal256Holder holder) { getWriter( - MinorType.DECIMAL256, - new ArrowType.Decimal(MAX_DECIMAL256_PRECISION, holder.scale, /*bitWidth=*/ 256)) + MinorType.DECIMAL256, + new ArrowType.Decimal(MAX_DECIMAL256_PRECISION, holder.scale, /*bitWidth=*/ 256)) .write(holder); } @Override public void writeDecimal256(long start, ArrowBuf buffer, ArrowType arrowType) { getWriter( - MinorType.DECIMAL256, - new ArrowType.Decimal( - MAX_DECIMAL256_PRECISION, - ((ArrowType.Decimal) arrowType).getScale(), - /*bitWidth=*/ 256)) + MinorType.DECIMAL256, + new ArrowType.Decimal( + MAX_DECIMAL256_PRECISION, + ((ArrowType.Decimal) arrowType).getScale(), + /*bitWidth=*/ 256)) .writeDecimal256(start, buffer, arrowType); } @Override public void writeDecimal256(BigDecimal value) { getWriter( - MinorType.DECIMAL256, - new ArrowType.Decimal(MAX_DECIMAL256_PRECISION, value.scale(), /*bitWidth=*/ 256)) + MinorType.DECIMAL256, + new ArrowType.Decimal(MAX_DECIMAL256_PRECISION, 
value.scale(), /*bitWidth=*/ 256)) .writeDecimal256(value); } @Override public void writeBigEndianBytesToDecimal256(byte[] value, ArrowType arrowType) { getWriter( - MinorType.DECIMAL256, - new ArrowType.Decimal( - MAX_DECIMAL256_PRECISION, - ((ArrowType.Decimal) arrowType).getScale(), - /*bitWidth=*/ 256)) + MinorType.DECIMAL256, + new ArrowType.Decimal( + MAX_DECIMAL256_PRECISION, + ((ArrowType.Decimal) arrowType).getScale(), + /*bitWidth=*/ 256)) .writeBigEndianBytesToDecimal256(value, arrowType); } @@ -526,4 +513,19 @@ public int getValueCapacity() { public void close() throws Exception { getWriter().close(); } + + /** + * Convert the writer to a PromotableViewWriter. + * + * @return The writer as a PromotableViewWriter. + */ + public PromotableViewWriter toViewWriter() { + PromotableViewWriter promotableViewWriter = new PromotableViewWriter(unionVector, parentContainer, nullableStructWriterFactory); + promotableViewWriter.position = position; + promotableViewWriter.writer = writer; + promotableViewWriter.state = state; + promotableViewWriter.unionVector = unionVector; + promotableViewWriter.type = MinorType.LISTVIEW; + return promotableViewWriter; + } } diff --git a/java/vector/src/main/codegen/templates/StructWriters.java b/java/vector/src/main/codegen/templates/StructWriters.java index b676173ac39d9..3e6258a0c6c0e 100644 --- a/java/vector/src/main/codegen/templates/StructWriters.java +++ b/java/vector/src/main/codegen/templates/StructWriters.java @@ -69,6 +69,9 @@ public class ${mode}StructWriter extends AbstractFieldWriter { case LIST: list(child.getName()); break; + case LISTVIEW: + listView(child.getName()); + break; case MAP: { ArrowType.Map arrowType = (ArrowType.Map) child.getType(); map(child.getName(), arrowType.getKeysSorted()); @@ -200,6 +203,31 @@ public ListWriter list(String name) { return writer; } + @Override + public ListWriter listView(String name) { + String finalName = handleCase(name); + FieldWriter writer = fields.get(finalName); + int vectorCount = container.size(); + if(writer == null) { + FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.LISTVIEW.getType(), null, null); + writer = new PromotableViewWriter(container.addOrGet(name, fieldType, ListViewVector.class), container, getNullableStructWriterFactory()); + if (container.size() > vectorCount) { + writer.allocate(); + } + writer.setPosition(idx()); + fields.put(finalName, writer); + } else { + if (writer instanceof PromotableViewWriter) { + // ensure writers are initialized + ((PromotableViewWriter) writer).getWriter(MinorType.LISTVIEW); + } else { + writer = ((PromotableWriter) writer).toViewWriter(); + ((PromotableViewWriter) writer).getWriter(MinorType.LISTVIEW); + } + } + return writer; + } + @Override public MapWriter map(String name) { return map(name, false); diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index eeb964c055f71..e40c70eaffdc1 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -69,7 +69,11 @@ public class Union${listName}Writer extends AbstractFieldWriter { public Union${listName}Writer(${listName}Vector vector, NullableStructWriterFactory nullableStructWriterFactory) { this.vector = vector; + <#if listName = "ListView"> + this.writer = new PromotableViewWriter(vector.getDataVector(), vector, nullableStructWriterFactory); + <#else> this.writer = new PromotableWriter(vector.getDataVector(), 
vector, nullableStructWriterFactory); + } public Union${listName}Writer(${listName}Vector vector, AbstractFieldWriter parent) { @@ -154,6 +158,17 @@ public ListWriter list(String name) { return listWriter; } + @Override + public ListWriter listView() { + return writer; + } + + @Override + public ListWriter listView(String name) { + ListWriter listWriter = writer.listView(name); + return listWriter; + } + @Override public StructWriter struct(String name) { StructWriter structWriter = writer.struct(name); @@ -215,6 +230,23 @@ public void endList() { setPosition(idx() + 1); listStarted = false; } + + public void startListView() { + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx()) * OFFSET_WIDTH)); + listStarted = true; + } + + @Override + public void endListView() { + int sizeUptoIdx = 0; + for (int i = 0; i < idx(); i++) { + sizeUptoIdx += vector.getSizeBuffer().getInt(i * SIZE_WIDTH); + } + vector.getSizeBuffer().setInt(idx() * SIZE_WIDTH, writer.idx() - sizeUptoIdx); + setPosition(idx() + 1); + listStarted = false; + } <#else> @Override public void startList() { diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 243bd832255c2..615ea3a536a15 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -91,6 +91,8 @@ private FieldReader getReaderForIndex(int index) { return (FieldReader) getStruct(); case LIST: return (FieldReader) getList(); + case LISTVIEW: + return (FieldReader) getListView(); case MAP: return (FieldReader) getMap(); <#list vv.types as type> @@ -130,6 +132,17 @@ private FieldReader getList() { return listReader; } + private UnionListViewReader listViewReader; + + private FieldReader getListView() { + if (listViewReader == null) { + listViewReader = new UnionListViewReader(data.getListView()); + listViewReader.setPosition(idx()); + readers[MinorType.LISTVIEW.ordinal()] = listViewReader; + } + return listViewReader; + } + private UnionMapReader mapReader; private FieldReader getMap() { diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index ea79c5c2fba76..e0fd0e4644313 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -94,6 +94,7 @@ public class UnionVector extends AbstractContainerVector implements FieldVector private StructVector structVector; private ListVector listVector; + private ListViewVector listViewVector; private MapVector mapVector; private FieldReader reader; @@ -335,6 +336,20 @@ public ListVector getList() { return listVector; } + public ListViewVector getListView() { + if (listViewVector == null) { + int vectorCount = internalStruct.size(); + listViewVector = addOrGet(MinorType.LISTVIEW, ListViewVector.class); + if (internalStruct.size() > vectorCount) { + listViewVector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } + } + } + return listViewVector; + } + public MapVector getMap() { if (mapVector == null) { throw new IllegalArgumentException("No map present. 
Provide ArrowType argument to create a new vector"); @@ -702,6 +717,8 @@ public ValueVector getVectorByType(int typeId, ArrowType arrowType) { return getStruct(); case LIST: return getList(); + case LISTVIEW: + return getListView(); case MAP: return getMap(name, arrowType); default: diff --git a/java/vector/src/main/codegen/templates/UnionViewWriter.java b/java/vector/src/main/codegen/templates/UnionViewWriter.java new file mode 100644 index 0000000000000..7b834d8b6cd86 --- /dev/null +++ b/java/vector/src/main/codegen/templates/UnionViewWriter.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionViewWriter.java" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.complex.impl.NullableStructWriterFactory; +import org.apache.arrow.vector.types.Types; + +<#function is_timestamp_tz type> + <#return type?starts_with("TimeStamp") && type?ends_with("TZ")> + + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ +@SuppressWarnings("unused") +public class UnionViewWriter extends UnionWriter { + + public UnionViewWriter(UnionVector vector) { + this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); + } + + public UnionViewWriter(UnionVector vector, NullableStructWriterFactory nullableStructWriterFactory) { + super(vector, nullableStructWriterFactory); + } + + @Override + public StructWriter struct() { + data.setType(idx(), MinorType.LISTVIEW); + getListWriter().setPosition(idx()); + return getListWriter().struct(); + } + + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal") || is_timestamp_tz(minor.class) || minor.class == "Duration" || minor.class == "FixedSizeBinary"> + + private ${name}Writer ${name?uncap_first}Writer; + + <#if minor.class?starts_with("Decimal") || is_timestamp_tz(minor.class) || minor.class == "Duration" || minor.class == "FixedSizeBinary"> + private ${name}Writer get${name}Writer(ArrowType arrowType) { + if (${uncappedName}Writer == null) { + ${uncappedName}Writer = new ${name}WriterImpl(data.get${name}Vector(arrowType)); + ${uncappedName}Writer.setPosition(idx()); + writers.add(${uncappedName}Writer); + } + return ${uncappedName}Writer; + } + + public ${name}Writer as${name}(ArrowType arrowType) { + data.setType(idx(), MinorType.${name?upper_case}); + return get${name}Writer(arrowType); + } + <#else> + private ${name}Writer get${name}Writer() { + if (${uncappedName}Writer == null) { + ${uncappedName}Writer = new ${name}WriterImpl(data.get${name}Vector()); + ${uncappedName}Writer.setPosition(idx()); + writers.add(${uncappedName}Writer); + } + return ${uncappedName}Writer; + } + + public ${name}Writer as${name}() { + data.setType(idx(), MinorType.${name?upper_case}); + return get${name}Writer(); + } + + + @Override + public void write(${name}Holder holder) { + data.setType(idx(), MinorType.${name?upper_case}); + <#if minor.class?starts_with("Decimal")> + ArrowType arrowType = new ArrowType.Decimal(holder.precision, holder.scale, ${name}Holder.WIDTH * 8); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, , arrowType); + <#elseif is_timestamp_tz(minor.class)> + ArrowType.Timestamp arrowTypeWithoutTz = (ArrowType.Timestamp) MinorType.${name?upper_case?remove_ending("TZ")}.getType(); + ArrowType arrowType = new ArrowType.Timestamp(arrowTypeWithoutTz.getUnit(), holder.timezone); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write(holder); + <#elseif minor.class == "Duration"> + ArrowType arrowType = new ArrowType.Duration(holder.unit); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write(holder); + <#elseif minor.class == "FixedSizeBinary"> + ArrowType arrowType = new ArrowType.FixedSizeBinary(holder.byteWidth); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write(holder); + <#else> + get${name}Writer().setPosition(idx()); + get${name}Writer().write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, ); + + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, <#if minor.class?starts_with("Decimal")>, ArrowType arrowType) { + data.setType(idx(), MinorType.${name?upper_case}); + <#if minor.class?starts_with("Decimal")> + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write${name}(<#list fields as field>${field.name}<#if field_has_next>, , arrowType); + <#elseif is_timestamp_tz(minor.class)> + ArrowType.Timestamp arrowTypeWithoutTz = (ArrowType.Timestamp) MinorType.${name?upper_case?remove_ending("TZ")}.getType(); + ArrowType arrowType = new ArrowType.Timestamp(arrowTypeWithoutTz.getUnit(), "UTC"); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); + <#elseif minor.class == "Duration" || minor.class == "FixedSizeBinary"> + // This is expected to throw. 
There's nothing more that we can do here since we can't infer any + // sort of default unit for the Duration or a default width for the FixedSizeBinary types. + ArrowType arrowType = MinorType.${name?upper_case}.getType(); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); + <#else> + get${name}Writer().setPosition(idx()); + get${name}Writer().write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); + + } + <#if minor.class?starts_with("Decimal")> + public void write${name}(${friendlyType} value) { + data.setType(idx(), MinorType.${name?upper_case}); + ArrowType arrowType = new ArrowType.Decimal(value.precision(), value.scale(), ${name}Vector.TYPE_WIDTH * 8); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).write${name}(value); + } + + public void writeBigEndianBytesTo${name}(byte[] value, ArrowType arrowType) { + data.setType(idx(), MinorType.${name?upper_case}); + get${name}Writer(arrowType).setPosition(idx()); + get${name}Writer(arrowType).writeBigEndianBytesTo${name}(value, arrowType); + } + <#elseif minor.class?ends_with("VarBinary")> + @Override + public void write${minor.class}(byte[] value) { + get${name}Writer().setPosition(idx()); + get${name}Writer().write${minor.class}(value); + } + + @Override + public void write${minor.class}(byte[] value, int offset, int length) { + get${name}Writer().setPosition(idx()); + get${name}Writer().write${minor.class}(value, offset, length); + } + + @Override + public void write${minor.class}(ByteBuffer value) { + get${name}Writer().setPosition(idx()); + get${name}Writer().write${minor.class}(value); + } + + @Override + public void write${minor.class}(ByteBuffer value, int offset, int length) { + get${name}Writer().setPosition(idx()); + get${name}Writer().write${minor.class}(value, offset, length); + } + <#elseif minor.class?ends_with("VarChar")> + @Override + public void write${minor.class}(${friendlyType} value) { + get${name}Writer().setPosition(idx()); + get${name}Writer().write${minor.class}(value); + } + + @Override + public void write${minor.class}(String value) { + get${name}Writer().setPosition(idx()); + get${name}Writer().write${minor.class}(value); + } + + + + + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if !minor.typeParams?? 
|| minor.class?starts_with("Decimal") || is_timestamp_tz(minor.class) || minor.class == "Duration" || minor.class == "FixedSizeBinary"> + + @Override + public ${capName}Writer ${lowerName}() { + data.setType(idx(), MinorType.LISTVIEW); + getListViewWriter().setPosition(idx()); + return getListViewWriter().${lowerName}(); + } + + +} diff --git a/java/vector/src/main/codegen/templates/UnionWriter.java b/java/vector/src/main/codegen/templates/UnionWriter.java index 08dbf24324b17..bfe97e2770553 100644 --- a/java/vector/src/main/codegen/templates/UnionWriter.java +++ b/java/vector/src/main/codegen/templates/UnionWriter.java @@ -42,12 +42,13 @@ @SuppressWarnings("unused") public class UnionWriter extends AbstractFieldWriter implements FieldWriter { - UnionVector data; - private StructWriter structWriter; - private UnionListWriter listWriter; - private UnionMapWriter mapWriter; - private List writers = new java.util.ArrayList<>(); - private final NullableStructWriterFactory nullableStructWriterFactory; + protected UnionVector data; + protected StructWriter structWriter; + protected UnionListWriter listWriter; + protected UnionListViewWriter listViewWriter; + protected UnionMapWriter mapWriter; + protected List writers = new java.util.ArrayList<>(); + protected final NullableStructWriterFactory nullableStructWriterFactory; public UnionWriter(UnionVector vector) { this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); @@ -58,6 +59,22 @@ public UnionWriter(UnionVector vector, NullableStructWriterFactory nullableStruc this.nullableStructWriterFactory = nullableStructWriterFactory; } + /** + * Convert the UnionWriter to a UnionViewWriter. + * + * @return the converted UnionViewWriter + */ + public UnionViewWriter toViewWriter() { + UnionViewWriter unionViewWriter = new UnionViewWriter(data, nullableStructWriterFactory); + unionViewWriter.structWriter = structWriter; + unionViewWriter.listWriter = listWriter; + unionViewWriter.listViewWriter = listViewWriter; + unionViewWriter.mapWriter = mapWriter; + unionViewWriter.writers = writers; + unionViewWriter.setPosition(this.getPosition()); + return unionViewWriter; + } + @Override public void setPosition(int index) { super.setPosition(index); @@ -89,6 +106,17 @@ public void endList() { getListWriter().endList(); } + @Override + public void startListView() { + getListViewWriter().startListView(); + data.setType(idx(), MinorType.LISTVIEW); + } + + @Override + public void endListView() { + getListViewWriter().endListView(); + } + @Override public void startMap() { getMapWriter().startMap(); @@ -134,7 +162,7 @@ public StructWriter asStruct() { return getStructWriter(); } - private ListWriter getListWriter() { + protected ListWriter getListWriter() { if (listWriter == null) { listWriter = new UnionListWriter(data.getList(), nullableStructWriterFactory); listWriter.setPosition(idx()); @@ -143,11 +171,25 @@ private ListWriter getListWriter() { return listWriter; } + protected ListWriter getListViewWriter() { + if (listViewWriter == null) { + listViewWriter = new UnionListViewWriter(data.getListView(), nullableStructWriterFactory); + listViewWriter.setPosition(idx()); + writers.add(listViewWriter); + } + return listViewWriter; + } + public ListWriter asList() { data.setType(idx(), MinorType.LIST); return getListWriter(); } + public ListWriter asListView() { + data.setType(idx(), MinorType.LISTVIEW); + return getListViewWriter(); + } + private MapWriter getMapWriter() { if (mapWriter == null) { mapWriter = new 
UnionMapWriter(data.getMap(new ArrowType.Map(false))); @@ -181,6 +223,8 @@ BaseWriter getWriter(MinorType minorType, ArrowType arrowType) { return getStructWriter(); case LIST: return getListWriter(); + case LISTVIEW: + return getListViewWriter(); case MAP: return getMapWriter(arrowType); <#list vv.types as type> @@ -367,6 +411,20 @@ public ListWriter list(String name) { return getStructWriter().list(name); } + @Override + public ListWriter listView() { + data.setType(idx(), MinorType.LISTVIEW); + getListViewWriter().setPosition(idx()); + return getListViewWriter().listView(); + } + + @Override + public ListWriter listView(String name) { + data.setType(idx(), MinorType.STRUCT); + getStructWriter().setPosition(idx()); + return getStructWriter().listView(name); + } + @Override public StructWriter struct(String name) { data.setType(idx(), MinorType.STRUCT); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java index 0cefbe4004b82..a6a71cf1a4190 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -25,6 +25,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; @@ -123,6 +124,10 @@ public ListVector addOrGetList(String name) { return addOrGet(name, FieldType.nullable(new List()), ListVector.class); } + public ListViewVector addOrGetListView(String name) { + return addOrGet(name, FieldType.nullable(new ListView()), ListViewVector.class); + } + public UnionVector addOrGetUnion(String name) { return addOrGet(name, FieldType.nullable(MinorType.UNION.getType()), UnionVector.class); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java index 0040d12811258..031cc8037bb8b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java @@ -101,7 +101,7 @@ private void allocateBuffers() { sizeBuffer = allocateBuffers(sizeAllocationSizeInBytes); } - private ArrowBuf allocateBuffers(final long size) { + protected ArrowBuf allocateBuffers(final long size) { final int curSize = (int) size; ArrowBuf buffer = allocator.buffer(curSize); buffer.readerIndex(0); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java index 864d08a661cd2..d719c9b1a9a4e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java @@ -32,16 +32,20 @@ import org.apache.arrow.memory.util.ByteFunctionHelpers; import org.apache.arrow.memory.util.CommonUtil; import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.AddOrGetResult; import 
org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.BufferBacked; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueIterableVector; import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.compare.VectorVisitor; -import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.complex.impl.UnionListViewReader; import org.apache.arrow.vector.complex.impl.UnionListViewWriter; import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -73,7 +77,7 @@ public class ListViewVector extends BaseRepeatedValueViewVector implements PromotableVector, ValueIterableVector> { protected ArrowBuf validityBuffer; - protected UnionListReader reader; + protected UnionListViewReader reader; private CallBack callBack; protected Field field; protected int validityAllocationSizeInBytes; @@ -245,7 +249,9 @@ public List getFieldBuffers() { */ @Override public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { - throw new UnsupportedOperationException("exportCDataBuffers Not implemented yet"); + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + exportBuffer(sizeBuffer, buffers, buffersPtr, nullValue, true); } @Override @@ -330,16 +336,22 @@ private long getNewAllocationSize(int currentBufferCapacity) { @Override public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { - // TODO: https://github.com/apache/arrow/issues/41270 - throw new UnsupportedOperationException( - "ListViewVector does not support copyFromSafe operation yet."); + copyFrom(inIndex, outIndex, from); + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + throw new UnsupportedOperationException("ListViewVector does not support visitor pattern."); } @Override public void copyFrom(int inIndex, int outIndex, ValueVector from) { - // TODO: https://github.com/apache/arrow/issues/41270 - throw new UnsupportedOperationException( - "ListViewVector does not support copyFrom operation yet."); + Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); + FieldReader in = from.getReader(); + in.setPosition(inIndex); + FieldWriter out = getWriter(); + out.setPosition(outIndex); + ComplexCopier.copy(in, out); } @Override @@ -359,23 +371,17 @@ public TransferPair getTransferPair(Field field, BufferAllocator allocator) { @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { - // TODO: https://github.com/apache/arrow/issues/41269 - throw new UnsupportedOperationException( - "ListVector does not support getTransferPair(String, BufferAllocator, CallBack) yet"); + return new TransferImpl(ref, allocator, callBack); } @Override public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { - // TODO: https://github.com/apache/arrow/issues/41269 - throw new UnsupportedOperationException( - "ListVector does not support getTransferPair(Field, BufferAllocator, CallBack) yet"); + return new TransferImpl(field, allocator, callBack); } @Override public TransferPair makeTransferPair(ValueVector target) { - // TODO: 
https://github.com/apache/arrow/issues/41269 - throw new UnsupportedOperationException( - "ListVector does not support makeTransferPair(ValueVector) yet"); + return new TransferImpl((ListViewVector) target); } @Override @@ -448,23 +454,172 @@ public int hashCode(int index, ArrowBufHasher hasher) { return hash; } - @Override - public OUT accept(VectorVisitor visitor, IN value) { - throw new UnsupportedOperationException(); + private class TransferImpl implements TransferPair { + + ListViewVector to; + TransferPair dataTransferPair; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + this(new ListViewVector(name, allocator, field.getFieldType(), callBack)); + } + + public TransferImpl(Field field, BufferAllocator allocator, CallBack callBack) { + this(new ListViewVector(field, allocator, callBack)); + } + + public TransferImpl(ListViewVector to) { + this.to = to; + to.addOrGetVector(vector.getField().getFieldType()); + if (to.getDataVector() instanceof ZeroVector) { + to.addOrGetVector(vector.getField().getFieldType()); + } + dataTransferPair = getDataVector().makeTransferPair(to.getDataVector()); + } + + @Override + public void transfer() { + to.clear(); + dataTransferPair.transfer(); + to.validityBuffer = transferBuffer(validityBuffer, to.allocator); + to.offsetBuffer = transferBuffer(offsetBuffer, to.allocator); + to.sizeBuffer = transferBuffer(sizeBuffer, to.allocator); + if (valueCount > 0) { + to.setValueCount(valueCount); + } + clear(); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + Preconditions.checkArgument( + startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", + startIndex, + length, + valueCount); + to.clear(); + if (length > 0) { + final int startPoint = offsetBuffer.getInt((long) startIndex * OFFSET_WIDTH); + // we have to scan by index since there are out-of-order offsets + to.offsetBuffer = to.allocateBuffers((long) length * OFFSET_WIDTH); + to.sizeBuffer = to.allocateBuffers((long) length * SIZE_WIDTH); + + /* splitAndTransfer the size buffer */ + int maxOffsetAndSizeSum = -1; + int minOffsetValue = -1; + for (int i = 0; i < length; i++) { + final int offsetValue = offsetBuffer.getInt((long) (startIndex + i) * OFFSET_WIDTH); + final int sizeValue = sizeBuffer.getInt((long) (startIndex + i) * SIZE_WIDTH); + to.sizeBuffer.setInt((long) i * SIZE_WIDTH, sizeValue); + if (maxOffsetAndSizeSum < offsetValue + sizeValue) { + maxOffsetAndSizeSum = offsetValue + sizeValue; + } + if (minOffsetValue == -1 || minOffsetValue > offsetValue) { + minOffsetValue = offsetValue; + } + } + + /* splitAndTransfer the offset buffer */ + for (int i = 0; i < length; i++) { + final int offsetValue = offsetBuffer.getInt((long) (startIndex + i) * OFFSET_WIDTH); + final int relativeOffset = offsetValue - minOffsetValue; + to.offsetBuffer.setInt((long) i * OFFSET_WIDTH, relativeOffset); + } + + /* splitAndTransfer the validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + + /* splitAndTransfer the data buffer */ + final int childSliceLength = maxOffsetAndSizeSum - minOffsetValue; + dataTransferPair.splitAndTransfer(minOffsetValue, childSliceLength); + to.setValueCount(length); + } + } + + /* + * transfer the validity. 
+ */ + private void splitAndTransferValidityBuffer(int startIndex, int length, ListViewVector target) { + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.getReferenceManager().release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.getReferenceManager().retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = + BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset); + byte b2 = + BitVectorHelper.getBitsFromNextByte( + validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = + BitVectorHelper.getBitsFromCurrentByte( + validityBuffer, firstByteSource + byteSizeTarget - 1, offset); + byte b2 = + BitVectorHelper.getBitsFromNextByte( + validityBuffer, firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = + BitVectorHelper.getBitsFromCurrentByte( + validityBuffer, firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, ListViewVector.this); + } } @Override protected FieldReader getReaderImpl() { - // TODO: https://github.com/apache/arrow/issues/41569 - throw new UnsupportedOperationException( - "ListViewVector does not support getReaderImpl operation yet."); + return new UnionListViewReader(this); } @Override - public UnionListReader getReader() { - // TODO: https://github.com/apache/arrow/issues/41569 - throw new UnsupportedOperationException( - "ListViewVector does not support getReader operation yet."); + public UnionListViewReader getReader() { + reader = (UnionListViewReader) super.getReader(); + return reader; } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 453f3ebb0c6e9..f3e48aa050e30 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -18,6 +18,7 @@ import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import 
org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.StateTool; @@ -30,6 +31,7 @@ public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWri private NullableStructWriter structRoot; private UnionListWriter listRoot; + private UnionListViewWriter listViewRoot; private UnionMapWriter mapRoot; private final NonNullableStructVector container; @@ -42,6 +44,7 @@ private enum Mode { INIT, STRUCT, LIST, + LISTVIEW, MAP } @@ -99,6 +102,9 @@ public void close() throws Exception { if (listRoot != null) { listRoot.close(); } + if (listViewRoot != null) { + listViewRoot.close(); + } } @Override @@ -110,6 +116,9 @@ public void clear() { case LIST: listRoot.clear(); break; + case LISTVIEW: + listViewRoot.clear(); + break; case MAP: mapRoot.clear(); break; @@ -127,6 +136,9 @@ public void setValueCount(int count) { case LIST: listRoot.setValueCount(count); break; + case LISTVIEW: + listViewRoot.setValueCount(count); + break; case MAP: mapRoot.setValueCount(count); break; @@ -145,6 +157,9 @@ public void setPosition(int index) { case LIST: listRoot.setPosition(index); break; + case LISTVIEW: + listViewRoot.setPosition(index); + break; case MAP: mapRoot.setPosition(index); break; @@ -232,6 +247,31 @@ public ListWriter rootAsList() { return listRoot; } + @Override + public ListWriter rootAsListView() { + switch (mode) { + case INIT: + int vectorCount = container.size(); + // TODO allow dictionaries in complex types + ListViewVector listVector = container.addOrGetListView(name); + if (container.size() > vectorCount) { + listVector.allocateNew(); + } + listViewRoot = new UnionListViewWriter(listVector, nullableStructWriterFactory); + listViewRoot.setPosition(idx()); + mode = Mode.LISTVIEW; + break; + + case LISTVIEW: + break; + + default: + check(Mode.INIT, Mode.STRUCT); + } + + return listViewRoot; + } + @Override public MapWriter rootAsMap(boolean keysSorted) { switch (mode) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListViewReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListViewReader.java new file mode 100644 index 0000000000000..17ac1150fd412 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListViewReader.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; +import org.apache.arrow.vector.complex.ListViewVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.UnionHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; + +/** {@link FieldReader} for listview of union types. */ +public class UnionListViewReader extends AbstractFieldReader { + + private final ListViewVector vector; + private final ValueVector data; + private int currentOffset; + private int size; + + /** + * Constructor for UnionListViewReader. + * + * @param vector the vector to read from + */ + public UnionListViewReader(ListViewVector vector) { + this.vector = vector; + this.data = vector.getDataVector(); + } + + @Override + public Field getField() { + return vector.getField(); + } + + @Override + public boolean isSet() { + return !vector.isNull(idx()); + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + if (vector.getOffsetBuffer().capacity() == 0) { + currentOffset = 0; + size = 0; + } else { + currentOffset = + vector.getOffsetBuffer().getInt(index * (long) BaseRepeatedValueViewVector.OFFSET_WIDTH); + size = vector.getSizeBuffer().getInt(index * (long) BaseRepeatedValueViewVector.SIZE_WIDTH); + } + } + + @Override + public FieldReader reader() { + return data.getReader(); + } + + @Override + public Object readObject() { + return vector.getObject(idx()); + } + + @Override + public MinorType getMinorType() { + return MinorType.LISTVIEW; + } + + @Override + public void read(int index, UnionHolder holder) { + setPosition(idx()); + for (int i = -1; i < index; i++) { + next(); + } + holder.reader = data.getReader(); + holder.isSet = data.getReader().isSet() ? 1 : 0; + } + + @Override + public int size() { + return Math.max(size, 0); + } + + @Override + public boolean next() { + // Here, the currentOffSet keeps track of the current position in the vector inside the list at + // set position. 
+ // And, size keeps track of the elements count in the list, so to make sure we traverse + // the full list, we need to check if the currentOffset is less than the currentOffset + size + if (currentOffset < currentOffset + size) { + data.getReader().setPosition(currentOffset++); + return true; + } else { + return false; + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index 604f18b56b5c7..626619a9483de 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -22,6 +22,7 @@ import static com.fasterxml.jackson.core.JsonToken.START_OBJECT; import static org.apache.arrow.vector.BufferLayout.BufferType.DATA; import static org.apache.arrow.vector.BufferLayout.BufferType.OFFSET; +import static org.apache.arrow.vector.BufferLayout.BufferType.SIZE; import static org.apache.arrow.vector.BufferLayout.BufferType.TYPE; import static org.apache.arrow.vector.BufferLayout.BufferType.VALIDITY; import static org.apache.arrow.vector.BufferLayout.BufferType.VARIADIC_DATA_BUFFERS; @@ -72,6 +73,7 @@ import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; @@ -724,7 +726,7 @@ private List readIntoBuffer( if (bufferType.equals(VALIDITY)) { reader = helper.BIT; - } else if (bufferType.equals(OFFSET)) { + } else if (bufferType.equals(OFFSET) || bufferType.equals(SIZE)) { if (type == MinorType.LARGELIST || type == MinorType.LARGEVARCHAR || type == MinorType.LARGEVARBINARY) { @@ -888,8 +890,8 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws IOEx BufferType bufferType = vectorTypes.get(v); nextFieldIs(bufferType.getName()); int innerBufferValueCount = valueCount; - if (bufferType.equals(OFFSET) && !(type instanceof Union)) { - /* offset buffer has 1 additional value capacity except for dense unions */ + if (bufferType.equals(OFFSET) && !(type instanceof Union) && !(type instanceof ListView)) { + /* offset buffer has 1 additional value capacity except for dense unions and ListView */ innerBufferValueCount = valueCount + 1; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index d1ee890f5c596..929c8c97c0551 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -73,6 +73,7 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.types.Types.MinorType; @@ -229,7 +230,9 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE // thus the values are only written to a single entity. 
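The reader and writer hunks here encode the same buffer-shape rule: for a value count of N, classic offset-bearing types (List, Map, VarChar, ...) serialize N + 1 offsets, while dense unions and ListView serialize exactly N offsets, with ListView pairing them with N sizes. A minimal sketch of that rule, assuming only the MinorType enum this module already uses; the class and helper name are hypothetical, not part of the patch:

import org.apache.arrow.vector.types.Types.MinorType;

final class OffsetShapeSketch {
  // Hypothetical helper mirroring the conditionals in JsonFileReader/JsonFileWriter:
  // dense unions and ListView carry exactly `valueCount` offset entries,
  // every other offset-bearing type carries one trailing offset.
  static int offsetEntryCount(MinorType type, int valueCount) {
    boolean noTrailingOffset =
        type == MinorType.DENSEUNION || type == MinorType.LISTVIEW;
    return noTrailingOffset ? valueCount : valueCount + 1;
  }
}

For example, a five-element ListView serializes five offsets and five sizes, where a five-element List would serialize six offsets.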
generator.writeArrayFieldStart(bufferType.getName()); final int bufferValueCount = - (bufferType.equals(OFFSET) && vector.getMinorType() != MinorType.DENSEUNION) + (bufferType.equals(OFFSET) + && vector.getMinorType() != MinorType.DENSEUNION + && vector.getMinorType() != MinorType.LISTVIEW) ? valueCount + 1 : valueCount; for (int i = 0; i < bufferValueCount; i++) { @@ -259,6 +262,7 @@ private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOE } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0 && (vector.getMinorType() == MinorType.LIST + || vector.getMinorType() == MinorType.LISTVIEW || vector.getMinorType() == MinorType.MAP || vector.getMinorType() == MinorType.VARBINARY || vector.getMinorType() == MinorType.VARCHAR)) { @@ -419,6 +423,10 @@ private void writeValueToGenerator( case MAP: generator.writeNumber(buffer.getInt((long) index * BaseVariableWidthVector.OFFSET_WIDTH)); break; + case LISTVIEW: + generator.writeNumber( + buffer.getInt((long) index * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + break; case LARGELIST: case LARGEVARBINARY: case LARGEVARCHAR: @@ -573,6 +581,8 @@ private void writeValueToGenerator( default: throw new UnsupportedOperationException("minor type: " + vector.getMinorType()); } + } else if (bufferType.equals(SIZE)) { + generator.writeNumber(buffer.getInt((long) index * BaseRepeatedValueViewVector.SIZE_WIDTH)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java index 1a58b65e3be4a..4fa808c18aece 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java @@ -32,7 +32,6 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.impl.UnionListViewWriter; -import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.holders.DurationHolder; import org.apache.arrow.vector.holders.TimeStampMilliTZHolder; import org.apache.arrow.vector.types.TimeUnit; @@ -40,6 +39,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -68,40 +68,40 @@ public void testBasicListViewVector() { /* write the first list at index 0 */ listViewWriter.setPosition(0); - listViewWriter.startList(); + listViewWriter.startListView(); listViewWriter.bigInt().writeBigInt(12); listViewWriter.bigInt().writeBigInt(-7); listViewWriter.bigInt().writeBigInt(25); - listViewWriter.endList(); + listViewWriter.endListView(); /* the second list at index 1 is null (we are not setting any)*/ /* write the third list at index 2 */ listViewWriter.setPosition(2); - listViewWriter.startList(); + listViewWriter.startListView(); listViewWriter.bigInt().writeBigInt(0); listViewWriter.bigInt().writeBigInt(-127); listViewWriter.bigInt().writeBigInt(127); listViewWriter.bigInt().writeBigInt(50); - listViewWriter.endList(); + listViewWriter.endListView(); /* write the fourth list at index 3 (empty list) */ listViewWriter.setPosition(3); - listViewWriter.startList(); - listViewWriter.endList(); + listViewWriter.startListView(); + listViewWriter.endListView(); /* write the fifth 
list at index 4 */ listViewWriter.setPosition(4); - listViewWriter.startList(); + listViewWriter.startListView(); listViewWriter.bigInt().writeBigInt(1); listViewWriter.bigInt().writeBigInt(2); listViewWriter.bigInt().writeBigInt(3); listViewWriter.bigInt().writeBigInt(4); - listViewWriter.endList(); + listViewWriter.endListView(); - listViewVector.setValueCount(5); + listViewWriter.setValueCount(5); // check value count assertEquals(5, listViewVector.getValueCount()); @@ -158,7 +158,7 @@ public void testImplicitNullVectors() { listViewWriter.bigInt().writeBigInt(12); listViewWriter.bigInt().writeBigInt(-7); listViewWriter.bigInt().writeBigInt(25); - listViewWriter.endList(); + listViewWriter.endListView(); int offSet0 = offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH); int size0 = sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH); @@ -172,7 +172,7 @@ public void testImplicitNullVectors() { assertEquals(3, size0); listViewWriter.setPosition(5); - listViewWriter.startList(); + listViewWriter.startListView(); // writing the 6th list at index 5, // and the list items from index 1 through 4 are not populated. @@ -192,7 +192,7 @@ public void testImplicitNullVectors() { listViewWriter.bigInt().writeBigInt(12); listViewWriter.bigInt().writeBigInt(25); - listViewWriter.endList(); + listViewWriter.endListView(); int offSet5 = offSetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH); int size5 = sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH); @@ -201,7 +201,7 @@ public void testImplicitNullVectors() { assertEquals(2, size5); listViewWriter.setPosition(10); - listViewWriter.startList(); + listViewWriter.startListView(); // writing the 11th list at index 10, // and the list items from index 6 through 10 are not populated. 
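The offset/size assertions in this test repeat one lookup pattern; factored out as a sketch (the helper class is hypothetical, but the buffer accessors and width constants are the ones the test already uses):

import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector;
import org.apache.arrow.vector.complex.ListViewVector;

final class ListViewProbeSketch {
  // Sketch: read the (offset, size) pair describing element `index`.
  // OFFSET_WIDTH and SIZE_WIDTH are both 4 bytes, so entries are plain ints.
  static int[] offsetAndSize(ListViewVector vector, int index) {
    int offset = vector.getOffsetBuffer()
        .getInt((long) index * BaseRepeatedValueViewVector.OFFSET_WIDTH);
    int size = vector.getSizeBuffer()
        .getInt((long) index * BaseRepeatedValueViewVector.SIZE_WIDTH);
    return new int[] {offset, size};
  }
}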
@@ -219,7 +219,7 @@ public void testImplicitNullVectors() { } listViewWriter.bigInt().writeBigInt(12); - listViewWriter.endList(); + listViewWriter.endListView(); int offSet11 = offSetBuffer.getInt(10 * BaseRepeatedValueViewVector.OFFSET_WIDTH); int size11 = sizeBuffer.getInt(10 * BaseRepeatedValueViewVector.SIZE_WIDTH); @@ -247,43 +247,43 @@ public void testNestedListViewVector() { /* write one or more inner lists at index 0 */ listViewWriter.setPosition(0); - listViewWriter.startList(); + listViewWriter.startListView(); - listViewWriter.list().startList(); - listViewWriter.list().bigInt().writeBigInt(50); - listViewWriter.list().bigInt().writeBigInt(100); - listViewWriter.list().bigInt().writeBigInt(200); - listViewWriter.list().endList(); + listViewWriter.listView().startListView(); + listViewWriter.listView().bigInt().writeBigInt(50); + listViewWriter.listView().bigInt().writeBigInt(100); + listViewWriter.listView().bigInt().writeBigInt(200); + listViewWriter.listView().endListView(); - listViewWriter.list().startList(); - listViewWriter.list().bigInt().writeBigInt(75); - listViewWriter.list().bigInt().writeBigInt(125); - listViewWriter.list().bigInt().writeBigInt(150); - listViewWriter.list().bigInt().writeBigInt(175); - listViewWriter.list().endList(); + listViewWriter.listView().startListView(); + listViewWriter.listView().bigInt().writeBigInt(75); + listViewWriter.listView().bigInt().writeBigInt(125); + listViewWriter.listView().bigInt().writeBigInt(150); + listViewWriter.listView().bigInt().writeBigInt(175); + listViewWriter.listView().endListView(); - listViewWriter.endList(); + listViewWriter.endListView(); /* write one or more inner lists at index 1 */ listViewWriter.setPosition(1); - listViewWriter.startList(); + listViewWriter.startListView(); - listViewWriter.list().startList(); - listViewWriter.list().bigInt().writeBigInt(10); - listViewWriter.list().endList(); + listViewWriter.listView().startListView(); + listViewWriter.listView().bigInt().writeBigInt(10); + listViewWriter.listView().endListView(); - listViewWriter.list().startList(); - listViewWriter.list().bigInt().writeBigInt(15); - listViewWriter.list().bigInt().writeBigInt(20); - listViewWriter.list().endList(); + listViewWriter.listView().startListView(); + listViewWriter.listView().bigInt().writeBigInt(15); + listViewWriter.listView().bigInt().writeBigInt(20); + listViewWriter.listView().endListView(); - listViewWriter.list().startList(); - listViewWriter.list().bigInt().writeBigInt(25); - listViewWriter.list().bigInt().writeBigInt(30); - listViewWriter.list().bigInt().writeBigInt(35); - listViewWriter.list().endList(); + listViewWriter.listView().startListView(); + listViewWriter.listView().bigInt().writeBigInt(25); + listViewWriter.listView().bigInt().writeBigInt(30); + listViewWriter.listView().bigInt().writeBigInt(35); + listViewWriter.listView().endListView(); - listViewWriter.endList(); + listViewWriter.endListView(); listViewVector.setValueCount(2); @@ -392,8 +392,8 @@ private void setValuesInBuffer(int[] bufValues, ArrowBuf buffer, long bufWidth) /* * Setting up the buffers directly needs to be validated with the base method used in - * the ListVector class where we use the approach of startList(), - * write to the child vector and endList(). + * the ListVector class where we use the approach of startListView(), + * write to the child vector and endListView(). *

* To support this, we have to consider the following scenarios: *

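A minimal sketch of the direct-buffer path that comment describes, using the setValidity/setOffset/setSize setters this test class exercises below; the allocator is assumed from the test fixture, and the child field shape is chosen arbitrarily for illustration:

try (ListViewVector v = ListViewVector.empty("direct", allocator)) {
  v.allocateNew();
  FieldType childType = new FieldType(true, new ArrowType.Int(64, true), null, null);
  v.initializeChildrenFromFields(
      Collections.singletonList(new Field("child", childType, null)));

  BigIntVector data = (BigIntVector) v.getDataVector();
  data.allocateNew(2);
  data.set(0, 7);
  data.set(1, 8);
  data.setValueCount(2);

  v.setValidity(0, 1); // element 0 is non-null
  v.setOffset(0, 0);   // element 0 starts at child index 0
  v.setSize(0, 2);     // element 0 spans two child values: [7, 8]
  v.setValueCount(1);  // one top-level element

  // v.getObject(0) now reads back as [7, 8], matching what a
  // startListView()/endListView() writer sequence would have produced.
}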
@@ -499,7 +499,7 @@ public void testBasicListViewSetNested() { listViewVector.allocateNew(); // Initialize the child vector using `initializeChildrenFromFields` method. - FieldType fieldType = new FieldType(true, new ArrowType.List(), null, null); + FieldType fieldType = new FieldType(true, new ArrowType.ListView(), null, null); FieldType childFieldType = new FieldType(true, new ArrowType.Int(64, true), null, null); Field childField = new Field("child-vector", childFieldType, null); List children = new ArrayList<>(); @@ -511,52 +511,52 @@ public void testBasicListViewSetNested() { FieldVector fieldVector = listViewVector.getDataVector(); fieldVector.clear(); - ListVector childVector = (ListVector) fieldVector; - UnionListWriter listWriter = childVector.getWriter(); - listWriter.allocate(); + ListViewVector childVector = (ListViewVector) fieldVector; + UnionListViewWriter listViewWriter = childVector.getWriter(); + listViewWriter.allocate(); - listWriter.setPosition(0); - listWriter.startList(); + listViewWriter.setPosition(0); + listViewWriter.startListView(); - listWriter.bigInt().writeBigInt(50); - listWriter.bigInt().writeBigInt(100); - listWriter.bigInt().writeBigInt(200); + listViewWriter.bigInt().writeBigInt(50); + listViewWriter.bigInt().writeBigInt(100); + listViewWriter.bigInt().writeBigInt(200); - listWriter.endList(); + listViewWriter.endListView(); - listWriter.setPosition(1); - listWriter.startList(); + listViewWriter.setPosition(1); + listViewWriter.startListView(); - listWriter.bigInt().writeBigInt(75); - listWriter.bigInt().writeBigInt(125); - listWriter.bigInt().writeBigInt(150); - listWriter.bigInt().writeBigInt(175); + listViewWriter.bigInt().writeBigInt(75); + listViewWriter.bigInt().writeBigInt(125); + listViewWriter.bigInt().writeBigInt(150); + listViewWriter.bigInt().writeBigInt(175); - listWriter.endList(); + listViewWriter.endListView(); - listWriter.setPosition(2); - listWriter.startList(); + listViewWriter.setPosition(2); + listViewWriter.startListView(); - listWriter.bigInt().writeBigInt(10); + listViewWriter.bigInt().writeBigInt(10); - listWriter.endList(); + listViewWriter.endListView(); - listWriter.startList(); - listWriter.setPosition(3); + listViewWriter.startListView(); + listViewWriter.setPosition(3); - listWriter.bigInt().writeBigInt(15); - listWriter.bigInt().writeBigInt(20); + listViewWriter.bigInt().writeBigInt(15); + listViewWriter.bigInt().writeBigInt(20); - listWriter.endList(); + listViewWriter.endListView(); - listWriter.startList(); - listWriter.setPosition(4); + listViewWriter.startListView(); + listViewWriter.setPosition(4); - listWriter.bigInt().writeBigInt(25); - listWriter.bigInt().writeBigInt(30); - listWriter.bigInt().writeBigInt(35); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.bigInt().writeBigInt(30); + listViewWriter.bigInt().writeBigInt(35); - listWriter.endList(); + listViewWriter.endListView(); childVector.setValueCount(5); @@ -713,12 +713,12 @@ public void testBasicListViewSetWithListViewWriter() { UnionListViewWriter listViewWriter = listViewVector.getWriter(); listViewWriter.setPosition(4); - listViewWriter.startList(); + listViewWriter.startListView(); listViewWriter.bigInt().writeBigInt(121); listViewWriter.bigInt().writeBigInt(-71); listViewWriter.bigInt().writeBigInt(251); - listViewWriter.endList(); + listViewWriter.endListView(); listViewVector.setValueCount(5); @@ -762,17 +762,17 @@ public void testGetBufferAddress() throws Exception { listViewWriter.allocate(); listViewWriter.setPosition(0); - 
listViewWriter.startList(); + listViewWriter.startListView(); listViewWriter.bigInt().writeBigInt(50); listViewWriter.bigInt().writeBigInt(100); listViewWriter.bigInt().writeBigInt(200); - listViewWriter.endList(); + listViewWriter.endListView(); listViewWriter.setPosition(1); - listViewWriter.startList(); + listViewWriter.startListView(); listViewWriter.bigInt().writeBigInt(250); listViewWriter.bigInt().writeBigInt(300); - listViewWriter.endList(); + listViewWriter.endListView(); listViewVector.setValueCount(2); @@ -919,10 +919,10 @@ public void testWriterGetField() { writer.allocate(); // set some values - writer.startList(); + writer.startListView(); writer.integer().writeInt(1); writer.integer().writeInt(2); - writer.endList(); + writer.endListView(); vector.setValueCount(2); Field expectedDataField = @@ -951,7 +951,7 @@ public void testWriterUsingHolderGetTimestampMilliTZField() { TimeStampMilliTZHolder holder = new TimeStampMilliTZHolder(); holder.timezone = "SomeFakeTimeZone"; - writer.startList(); + writer.startListView(); holder.value = 12341234L; writer.timeStampMilliTZ().write(holder); holder.value = 55555L; @@ -967,7 +967,7 @@ public void testWriterUsingHolderGetTimestampMilliTZField() { "holder.timezone: AsdfTimeZone not equal to vector timezone: SomeFakeTimeZone", ex.getMessage()); - writer.endList(); + writer.endListView(); vector.setValueCount(1); Field expectedDataField = @@ -997,7 +997,7 @@ public void testWriterGetDurationField() { DurationHolder durationHolder = new DurationHolder(); durationHolder.unit = TimeUnit.MILLISECOND; - writer.startList(); + writer.startListView(); durationHolder.value = 812374L; writer.duration().write(durationHolder); durationHolder.value = 143451L; @@ -1011,7 +1011,7 @@ public void testWriterGetDurationField() { IllegalArgumentException.class, () -> writer.duration().write(durationHolder)); assertEquals("holder.unit: SECOND not equal to vector unit: MILLISECOND", ex.getMessage()); - writer.endList(); + writer.endListView(); vector.setValueCount(1); Field expectedDataField = @@ -1039,10 +1039,10 @@ public void testClose() throws Exception { writer.allocate(); // set some values - writer.startList(); + writer.startListView(); writer.integer().writeInt(1); writer.integer().writeInt(2); - writer.endList(); + writer.endListView(); vector.setValueCount(2); assertTrue(vector.getBufferSize() > 0); @@ -1144,27 +1144,27 @@ public void testSetNull1() { writer.allocate(); writer.setPosition(0); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(10); writer.bigInt().writeBigInt(20); - writer.endList(); + writer.endListView(); vector.setNull(1); writer.setPosition(2); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(30); writer.bigInt().writeBigInt(40); - writer.endList(); + writer.endListView(); vector.setNull(3); vector.setNull(4); writer.setPosition(5); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(50); writer.bigInt().writeBigInt(60); - writer.endList(); + writer.endListView(); vector.setValueCount(6); @@ -1238,24 +1238,24 @@ public void testSetNull2() { vector.setNull(4); writer.setPosition(1); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(10); writer.bigInt().writeBigInt(20); writer.bigInt().writeBigInt(30); - writer.endList(); + writer.endListView(); writer.setPosition(3); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(40); writer.bigInt().writeBigInt(50); - writer.endList(); + writer.endListView(); 
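For orientation, once the remaining write below lands at position 5 and the value count is set, spot checks in the test's own style would hold (fixture assumed from this method):

assertTrue(vector.isNull(4));                                    // nulled up front, never overwritten
assertEquals(Arrays.asList(10L, 20L, 30L), vector.getObject(1)); // written through the writer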
writer.setPosition(5); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(60); writer.bigInt().writeBigInt(70); writer.bigInt().writeBigInt(80); - writer.endList(); + writer.endListView(); vector.setValueCount(6); @@ -1327,24 +1327,24 @@ public void testSetNull3() { writer.allocate(); writer.setPosition(1); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(10); writer.bigInt().writeBigInt(20); writer.bigInt().writeBigInt(30); - writer.endList(); + writer.endListView(); writer.setPosition(3); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(40); writer.bigInt().writeBigInt(50); - writer.endList(); + writer.endListView(); writer.setPosition(5); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(60); writer.bigInt().writeBigInt(70); writer.bigInt().writeBigInt(80); - writer.endList(); + writer.endListView(); vector.setNull(0); vector.setNull(2); @@ -1419,31 +1419,31 @@ public void testOverWrite1() { writer.allocate(); writer.setPosition(0); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(10); writer.bigInt().writeBigInt(20); writer.bigInt().writeBigInt(30); - writer.endList(); + writer.endListView(); writer.setPosition(1); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(40); writer.bigInt().writeBigInt(50); - writer.endList(); + writer.endListView(); vector.setValueCount(2); writer.setPosition(0); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(60); writer.bigInt().writeBigInt(70); - writer.endList(); + writer.endListView(); writer.setPosition(1); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(80); writer.bigInt().writeBigInt(90); - writer.endList(); + writer.endListView(); vector.setValueCount(2); @@ -1473,17 +1473,17 @@ public void testOverwriteWithNull() { ArrowBuf sizeBuffer = vector.getSizeBuffer(); writer.setPosition(0); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(10); writer.bigInt().writeBigInt(20); writer.bigInt().writeBigInt(30); - writer.endList(); + writer.endListView(); writer.setPosition(1); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(40); writer.bigInt().writeBigInt(50); - writer.endList(); + writer.endListView(); vector.setValueCount(2); @@ -1507,19 +1507,19 @@ public void testOverwriteWithNull() { assertTrue(vector.isNull(1)); writer.setPosition(0); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(60); writer.bigInt().writeBigInt(70); - writer.endList(); + writer.endListView(); assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); writer.setPosition(1); - writer.startList(); + writer.startListView(); writer.bigInt().writeBigInt(80); writer.bigInt().writeBigInt(90); - writer.endList(); + writer.endListView(); assertEquals(2, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); assertEquals(2, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); @@ -1655,11 +1655,440 @@ public void testOutOfOrderOffset1() { } } + private int validateSizeBufferAndCalculateMinOffset( + int start, + int splitLength, + ArrowBuf fromOffsetBuffer, + ArrowBuf fromSizeBuffer, + ArrowBuf toSizeBuffer) { + int minOffset = fromOffsetBuffer.getInt((long) start * ListViewVector.OFFSET_WIDTH); + int fromDataLength; + int toDataLength; + + for (int i = 0; i < splitLength; 
i++) { + fromDataLength = fromSizeBuffer.getInt((long) (start + i) * ListViewVector.SIZE_WIDTH); + toDataLength = toSizeBuffer.getInt((long) (i) * ListViewVector.SIZE_WIDTH); + + /* validate size */ + assertEquals( + fromDataLength, + toDataLength, + "Different data lengths at index: " + i + " and start: " + start); + + /* calculate minimum offset */ + int currentOffset = fromOffsetBuffer.getInt((long) (start + i) * ListViewVector.OFFSET_WIDTH); + if (currentOffset < minOffset) { + minOffset = currentOffset; + } + } + + return minOffset; + } + + private void validateOffsetBuffer( + int start, + int splitLength, + ArrowBuf fromOffsetBuffer, + ArrowBuf toOffsetBuffer, + int minOffset) { + int offset1; + int offset2; + + for (int i = 0; i < splitLength; i++) { + offset1 = fromOffsetBuffer.getInt((long) (start + i) * ListViewVector.OFFSET_WIDTH); + offset2 = toOffsetBuffer.getInt((long) (i) * ListViewVector.OFFSET_WIDTH); + assertEquals( + offset1 - minOffset, + offset2, + "Different offset values at index: " + i + " and start: " + start); + } + } + + private void validateDataBuffer( + int start, + int splitLength, + ArrowBuf fromOffsetBuffer, + ArrowBuf fromSizeBuffer, + BigIntVector fromDataVector, + ArrowBuf toOffsetBuffer, + BigIntVector toDataVector) { + int dataLength; + Long fromValue; + for (int i = 0; i < splitLength; i++) { + dataLength = fromSizeBuffer.getInt((long) (start + i) * ListViewVector.SIZE_WIDTH); + for (int j = 0; j < dataLength; j++) { + fromValue = + fromDataVector.getObject( + (fromOffsetBuffer.getInt((long) (start + i) * ListViewVector.OFFSET_WIDTH) + j)); + Long toValue = + toDataVector.getObject( + (toOffsetBuffer.getInt((long) i * ListViewVector.OFFSET_WIDTH) + j)); + assertEquals( + fromValue, toValue, "Different data values at index: " + i + " and start: " + start); + } + } + } + + /** + * Validate split and transfer of data from fromVector to toVector. Note that this method assumes + * that the child vector is BigIntVector. 
+ * + * @param start start index + * @param splitLength length of data to split and transfer + * @param fromVector fromVector + * @param toVector toVector + */ + private void validateSplitAndTransfer( + TransferPair transferPair, + int start, + int splitLength, + ListViewVector fromVector, + ListViewVector toVector) { + + transferPair.splitAndTransfer(start, splitLength); + + /* get offsetBuffer of toVector */ + final ArrowBuf toOffsetBuffer = toVector.getOffsetBuffer(); + + /* get sizeBuffer of toVector */ + final ArrowBuf toSizeBuffer = toVector.getSizeBuffer(); + + /* get dataVector of toVector */ + BigIntVector toDataVector = (BigIntVector) toVector.getDataVector(); + + /* get offsetBuffer of toVector */ + final ArrowBuf fromOffsetBuffer = fromVector.getOffsetBuffer(); + + /* get sizeBuffer of toVector */ + final ArrowBuf fromSizeBuffer = fromVector.getSizeBuffer(); + + /* get dataVector of toVector */ + BigIntVector fromDataVector = (BigIntVector) fromVector.getDataVector(); + + /* validate size buffers */ + int minOffset = + validateSizeBufferAndCalculateMinOffset( + start, splitLength, fromOffsetBuffer, fromSizeBuffer, toSizeBuffer); + /* validate offset buffers */ + validateOffsetBuffer(start, splitLength, fromOffsetBuffer, toOffsetBuffer, minOffset); + /* validate data */ + validateDataBuffer( + start, + splitLength, + fromOffsetBuffer, + fromSizeBuffer, + fromDataVector, + toOffsetBuffer, + toDataVector); + } + + @Test + public void testSplitAndTransfer() throws Exception { + try (ListViewVector fromVector = ListViewVector.empty("sourceVector", allocator)) { + + /* Explicitly add the dataVector */ + MinorType type = MinorType.BIGINT; + fromVector.addOrGetVector(FieldType.nullable(type.getType())); + + UnionListViewWriter listViewWriter = fromVector.getWriter(); + + /* allocate memory */ + listViewWriter.allocate(); + + /* populate data */ + listViewWriter.setPosition(0); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(10); + listViewWriter.bigInt().writeBigInt(11); + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.endListView(); + + listViewWriter.setPosition(1); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(13); + listViewWriter.bigInt().writeBigInt(14); + listViewWriter.endListView(); + + listViewWriter.setPosition(2); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(15); + listViewWriter.bigInt().writeBigInt(16); + listViewWriter.bigInt().writeBigInt(17); + listViewWriter.bigInt().writeBigInt(18); + listViewWriter.endListView(); + + listViewWriter.setPosition(3); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(19); + listViewWriter.endListView(); + + listViewWriter.setPosition(4); + listViewWriter.startListView(); + listViewWriter.bigInt().writeBigInt(20); + listViewWriter.bigInt().writeBigInt(21); + listViewWriter.bigInt().writeBigInt(22); + listViewWriter.bigInt().writeBigInt(23); + listViewWriter.endListView(); + + fromVector.setValueCount(5); + + /* get offset buffer */ + final ArrowBuf offsetBuffer = fromVector.getOffsetBuffer(); + + /* get size buffer */ + final ArrowBuf sizeBuffer = fromVector.getSizeBuffer(); + + /* get dataVector */ + BigIntVector dataVector = (BigIntVector) fromVector.getDataVector(); + + /* check the vector output */ + + int index = 0; + int offset; + int size = 0; + Long actual; + + /* index 0 */ + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(0), 
Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(10), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(11), actual); + offset++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(12), actual); + assertEquals( + Integer.toString(3), + Integer.toString(sizeBuffer.getInt(index * ListViewVector.SIZE_WIDTH))); + + /* index 1 */ + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(3), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(13), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(14), actual); + size++; + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * ListViewVector.SIZE_WIDTH))); + + /* index 2 */ + size = 0; + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(5), Integer.toString(offset)); + size++; + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(15), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(16), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(17), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(18), actual); + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * ListViewVector.SIZE_WIDTH))); + + /* index 3 */ + size = 0; + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(9), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(19), actual); + size++; + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * ListViewVector.SIZE_WIDTH))); + + /* index 4 */ + size = 0; + index++; + assertFalse(fromVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListViewVector.OFFSET_WIDTH); + assertEquals(Integer.toString(10), Integer.toString(offset)); + + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(20), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(21), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(22), actual); + offset++; + size++; + actual = dataVector.getObject(offset); + assertEquals(Long.valueOf(23), actual); + size++; + assertEquals( + Integer.toString(size), + Integer.toString(sizeBuffer.getInt(index * ListViewVector.SIZE_WIDTH))); + + /* do split and transfer */ + try (ListViewVector toVector = ListViewVector.empty("toVector", allocator)) { + int[][] transferLengths = {{0, 2}, {3, 1}, {4, 1}}; + TransferPair transferPair = fromVector.makeTransferPair(toVector); + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + validateSplitAndTransfer(transferPair, start, splitLength, fromVector, toVector); + } + } + } + } + + @Test + public void testOutOfOrderOffsetSplitAndTransfer() { + // [[12, -7, 25], null, [0, -127, 127, 50], [], [50, 12]] + try (ListViewVector fromVector = ListViewVector.empty("fromVector", allocator)) { + // Allocate buffers in 
listViewVector by calling `allocateNew` method. + fromVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(64, true), null, null); + Field field = new Field("child-vector", fieldType, null); + fromVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = fromVector.getDataVector(); + fieldVector.clear(); + + BigIntVector childVector = (BigIntVector) fieldVector; + + childVector.allocateNew(7); + + childVector.set(0, 0); + childVector.set(1, -127); + childVector.set(2, 127); + childVector.set(3, 50); + childVector.set(4, 12); + childVector.set(5, -7); + childVector.set(6, 25); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + fromVector.setValidity(0, 1); + fromVector.setValidity(1, 0); + fromVector.setValidity(2, 1); + fromVector.setValidity(3, 1); + fromVector.setValidity(4, 1); + + fromVector.setOffset(0, 4); + fromVector.setOffset(1, 7); + fromVector.setOffset(2, 0); + fromVector.setOffset(3, 0); + fromVector.setOffset(4, 3); + + fromVector.setSize(0, 3); + fromVector.setSize(1, 0); + fromVector.setSize(2, 4); + fromVector.setSize(3, 0); + fromVector.setSize(4, 2); + + // Set value count using `setValueCount` method. + fromVector.setValueCount(5); + + final ArrowBuf offSetBuffer = fromVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = fromVector.getSizeBuffer(); + + // check offset buffer + assertEquals(4, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check child vector + assertEquals(0, ((BigIntVector) fromVector.getDataVector()).get(0)); + assertEquals(-127, ((BigIntVector) fromVector.getDataVector()).get(1)); + assertEquals(127, ((BigIntVector) fromVector.getDataVector()).get(2)); + assertEquals(50, ((BigIntVector) fromVector.getDataVector()).get(3)); + assertEquals(12, ((BigIntVector) fromVector.getDataVector()).get(4)); + assertEquals(-7, ((BigIntVector) fromVector.getDataVector()).get(5)); + assertEquals(25, ((BigIntVector) fromVector.getDataVector()).get(6)); + + // check values + Object result = fromVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(12), resultSet.get(0)); + assertEquals(Long.valueOf(-7), resultSet.get(1)); + assertEquals(Long.valueOf(25), resultSet.get(2)); + + assertTrue(fromVector.isNull(1)); + + result = fromVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(4, resultSet.size()); + assertEquals(Long.valueOf(0), resultSet.get(0)); + assertEquals(Long.valueOf(-127), resultSet.get(1)); + 
assertEquals(Long.valueOf(127), resultSet.get(2)); + assertEquals(Long.valueOf(50), resultSet.get(3)); + + assertTrue(fromVector.isEmpty(3)); + + result = fromVector.getObject(4); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(50), resultSet.get(0)); + assertEquals(Long.valueOf(12), resultSet.get(1)); + + fromVector.validate(); + + /* do split and transfer */ + try (ListViewVector toVector = ListViewVector.empty("toVector", allocator)) { + int[][] transferLengths = {{2, 3}, {0, 1}, {0, 3}}; + TransferPair transferPair = fromVector.makeTransferPair(toVector); + + for (final int[] transferLength : transferLengths) { + int start = transferLength[0]; + int splitLength = transferLength[1]; + validateSplitAndTransfer(transferPair, start, splitLength, fromVector, toVector); + } + } + } + } + private void writeIntValues(UnionListViewWriter writer, int[] values) { - writer.startList(); + writer.startListView(); for (int v : values) { writer.integer().writeInt(v); } - writer.endList(); + writer.endListView(); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 40e55fce9bfa2..376ad3ec7504f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -47,9 +47,11 @@ import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.holders.NullableIntHolder; import org.apache.arrow.vector.holders.NullableUInt4Holder; @@ -2935,6 +2937,29 @@ public void testListVectorSetNull() { } } + @Test + public void testListViewVectorSetNull() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writeListViewVector(writer, new int[] {1, 2}); + writeListViewVector(writer, new int[] {3, 4}); + writeListViewVector(writer, new int[] {5, 6}); + vector.setNull(3); + vector.setNull(4); + vector.setNull(5); + writer.setValueCount(6); + + assertEquals(vector.getObject(0), Arrays.asList(1, 2)); + assertEquals(vector.getObject(1), Arrays.asList(3, 4)); + assertEquals(vector.getObject(2), Arrays.asList(5, 6)); + assertTrue(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertTrue(vector.isNull(5)); + } + } + @Test public void testStructVectorEqualsWithNull() { @@ -3266,6 +3291,14 @@ private void writeListVector(UnionListWriter writer, int[] values) { writer.endList(); } + private void writeListViewVector(UnionListViewWriter writer, int[] values) { + writer.startListView(); + for (int v : values) { + writer.integer().writeInt(v); + } + writer.endListView(); + } + @Test public void testVariableVectorGetEndOffset() { try (final VarCharVector vector1 = new VarCharVector("v1", allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java 
index 654940908bf38..2745386db4e22 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -47,6 +47,7 @@ import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.StructVector; @@ -57,6 +58,8 @@ import org.apache.arrow.vector.complex.impl.SingleStructReaderImpl; import org.apache.arrow.vector.complex.impl.SingleStructWriter; import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListViewReader; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.impl.UnionMapReader; import org.apache.arrow.vector.complex.impl.UnionReader; @@ -116,6 +119,195 @@ public void terminate() throws Exception { allocator.close(); } + /* Test Utils */ + + private void checkNullableStruct(NonNullableStructVector structVector) { + StructReader rootReader = new SingleStructReaderImpl(structVector).reader("root"); + for (int i = 0; i < COUNT; i++) { + rootReader.setPosition(i); + assertTrue(rootReader.isSet(), "index is set: " + i); + FieldReader struct = rootReader.reader("struct"); + if (i % 2 == 0) { + assertTrue(struct.isSet(), "index is set: " + i); + assertNotNull(struct.readObject(), "index is set: " + i); + assertEquals(i, struct.reader("nested").readLong().longValue()); + } else { + assertFalse(struct.isSet(), "index is not set: " + i); + assertNull(struct.readObject(), "index is not set: " + i); + } + } + } + + private void createListTypeVectorWithScalarType(FieldWriter writer) { + for (int i = 0; i < COUNT; i++) { + writer.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + writer.writeInt(j); + } else { + IntHolder holder = new IntHolder(); + holder.value = j; + writer.write(holder); + } + } + writer.endList(); + } + } + + private void checkListTypeVectorWithScalarType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + assertEquals(j, reader.reader().readInteger().intValue()); + } + } + } + + private void createListTypeVectorWithScalarNull(FieldWriter writer) { + for (int i = 0; i < COUNT; i++) { + writer.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + writer.writeNull(); + } else { + IntHolder holder = new IntHolder(); + holder.value = j; + writer.write(holder); + } + } + writer.endList(); + } + } + + private void checkListTypeVectorWithScalarNull(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + if (j % 2 == 0) { + assertFalse(reader.reader().isSet(), "index is set: " + j); + } else { + assertTrue(reader.reader().isSet(), "index is not set: " + j); + assertEquals(j, reader.reader().readInteger().intValue()); + } + } + } + } + + private void createListTypeVectorWithDecimalType(FieldWriter writer, DecimalHolder holder) { + holder.buffer = allocator.buffer(DecimalVector.TYPE_WIDTH); + ArrowType arrowType = new ArrowType.Decimal(10, 0, 128); + for (int i = 0; i < COUNT; i++) { + writer.startList(); 
+ for (int j = 0; j < i % 7; j++) { + if (j % 4 == 0) { + writer.writeDecimal(new BigDecimal(j)); + } else if (j % 4 == 1) { + DecimalUtility.writeBigDecimalToArrowBuf( + new BigDecimal(j), holder.buffer, 0, DecimalVector.TYPE_WIDTH); + holder.start = 0; + holder.scale = 0; + holder.precision = 10; + writer.write(holder); + } else if (j % 4 == 2) { + DecimalUtility.writeBigDecimalToArrowBuf( + new BigDecimal(j), holder.buffer, 0, DecimalVector.TYPE_WIDTH); + writer.writeDecimal(0, holder.buffer, arrowType); + } else { + byte[] value = BigDecimal.valueOf(j).unscaledValue().toByteArray(); + writer.writeBigEndianBytesToDecimal(value, arrowType); + } + } + writer.endList(); + } + } + + private void checkListTypeVectorWithDecimalType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + Object expected = new BigDecimal(j); + Object actual = reader.reader().readBigDecimal(); + assertEquals(expected, actual); + } + } + } + + private void createListTypeVectorWithTimeStampMilliTZType(FieldWriter writer) { + for (int i = 0; i < COUNT; i++) { + writer.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + writer.writeNull(); + } else { + TimeStampMilliTZHolder holder = new TimeStampMilliTZHolder(); + holder.timezone = "FakeTimeZone"; + holder.value = j; + writer.timeStampMilliTZ().write(holder); + } + } + writer.endList(); + } + } + + private void checkListTypeVectorWithTimeStampMilliTZType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + if (j % 2 == 0) { + assertFalse(reader.reader().isSet(), "index is set: " + j); + } else { + NullableTimeStampMilliTZHolder actual = new NullableTimeStampMilliTZHolder(); + reader.reader().read(actual); + assertEquals(j, actual.value); + assertEquals("FakeTimeZone", actual.timezone); + } + } + } + } + + private void createNullsWithListWriters(FieldWriter writer) { + for (int i = 0; i < COUNT; i++) { + writer.setPosition(i); + if (i % 2 == 0) { + writer.startList(); + if (i % 4 == 0) { + writer.integer().writeNull(); + } else { + writer.integer().writeInt(i); + writer.integer().writeInt(i * 2); + } + writer.endList(); + } else { + writer.writeNull(); + } + } + } + + private void checkNullsWithListWriters(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + if (i % 2 == 0) { + assertTrue(reader.isSet()); + reader.next(); + if (i % 4 == 0) { + assertNull(reader.reader().readInteger()); + } else { + assertEquals(i, reader.reader().readInteger().intValue()); + reader.next(); + assertEquals(i * 2, reader.reader().readInteger().intValue()); + } + } else { + assertFalse(reader.isSet()); + } + } + } + + /* Test Cases */ + @Test public void simpleNestedTypes() { NonNullableStructVector parent = populateStructVector(null); @@ -213,23 +405,6 @@ public void nullableStruct2() { } } - private void checkNullableStruct(NonNullableStructVector structVector) { - StructReader rootReader = new SingleStructReaderImpl(structVector).reader("root"); - for (int i = 0; i < COUNT; i++) { - rootReader.setPosition(i); - assertTrue(rootReader.isSet(), "index is set: " + i); - FieldReader struct = rootReader.reader("struct"); - if (i % 2 == 0) { - assertTrue(struct.isSet(), "index is set: " + i); - assertNotNull(struct.readObject(), "index is set: " + i); - assertEquals(i, struct.reader("nested").readLong().longValue()); - } else { - assertFalse(struct.isSet(), "index is not set: 
" + i); - assertNull(struct.readObject(), "index is not set: " + i); - } - } - } - @Test public void testList() { try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { @@ -260,72 +435,259 @@ public void testList() { } } - @Test - public void listScalarType() { - try (ListVector listVector = ListVector.empty("list", allocator)) { - listVector.allocateNew(); - UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); + private void createListTypeVectorWithDurationType(FieldWriter writer) { + for (int i = 0; i < COUNT; i++) { + writer.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + writer.writeNull(); + } else { + DurationHolder holder = new DurationHolder(); + holder.unit = TimeUnit.MICROSECOND; + holder.value = j; + writer.duration().write(holder); + } + } + writer.endList(); + } + } + + private void checkListTypeVectorWithDurationType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + if (j % 2 == 0) { + assertFalse(reader.reader().isSet(), "index is set: " + j); + } else { + NullableDurationHolder actual = new NullableDurationHolder(); + reader.reader().read(actual); + assertEquals(TimeUnit.MICROSECOND, actual.unit); + assertEquals(j, actual.value); + } + } + } + } + + private void createScalarTypeVectorWithNullableType(FieldWriter writer) { + for (int i = 0; i < COUNT; i++) { + if (i % 2 == 0) { + writer.setPosition(i); + writer.startList(); for (int j = 0; j < i % 7; j++) { - if (j % 2 == 0) { - listWriter.writeInt(j); + writer.writeInt(j); + } + writer.endList(); + } + } + } + + private void checkScalarTypeVectorWithNullableType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + if (i % 2 == 0) { + assertTrue(reader.isSet(), "index is set: " + i); + assertEquals(i % 7, ((List) reader.readObject()).size(), "correct length at: " + i); + } else { + assertFalse(reader.isSet(), "index is not set: " + i); + assertNull(reader.readObject(), "index is not set: " + i); + } + } + } + + private void createListTypeVectorWithStructType( + FieldWriter fieldWriter, StructWriter structWriter) { + for (int i = 0; i < COUNT; i++) { + fieldWriter.startList(); + for (int j = 0; j < i % 7; j++) { + structWriter.start(); + structWriter.integer("int").writeInt(j); + structWriter.bigInt("bigInt").writeBigInt(j); + structWriter.end(); + } + fieldWriter.endList(); + } + } + + private void checkListTypeVectorWithStructType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + assertEquals(j, reader.reader().reader("int").readInteger().intValue(), "record: " + i); + assertEquals(j, reader.reader().reader("bigInt").readLong().longValue()); + } + } + } + + private void checkListOfListTypes(final FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + FieldReader innerListReader = reader.reader(); + for (int k = 0; k < i % 13; k++) { + innerListReader.next(); + assertEquals(k, innerListReader.reader().readInteger().intValue(), "record: " + i); + } + } + } + } + + private void checkUnionListType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + FieldReader innerListReader = reader.reader(); + for (int k = 0; k < i % 13; k++) 
{ + innerListReader.next(); + if (k % 2 == 0) { + assertEquals(k, innerListReader.reader().readInteger().intValue(), "record: " + i); } else { - IntHolder holder = new IntHolder(); - holder.value = j; - listWriter.write(holder); + assertEquals(k, innerListReader.reader().readLong().longValue(), "record: " + i); } } - listWriter.endList(); } - listWriter.setValueCount(COUNT); - UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - assertEquals(j, listReader.reader().readInteger().intValue()); + } + } + + private static void createListTypeVectorWithMapType(FieldWriter writer) { + MapWriter innerMapWriter = writer.map(true); + for (int i = 0; i < COUNT; i++) { + writer.startList(); + for (int j = 0; j < i % 7; j++) { + innerMapWriter.startMap(); + for (int k = 0; k < i % 13; k++) { + innerMapWriter.startEntry(); + innerMapWriter.key().integer().writeInt(k); + if (k % 2 == 0) { + innerMapWriter.value().bigInt().writeBigInt(k); + } + innerMapWriter.endEntry(); } + innerMapWriter.endMap(); } + writer.endList(); + } + } + + private void checkListTypeMap(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + UnionMapReader mapReader = (UnionMapReader) reader.reader(); + for (int k = 0; k < i % 13; k++) { + mapReader.next(); + assertEquals(k, mapReader.key().readInteger().intValue(), "record key: " + i); + if (k % 2 == 0) { + assertEquals(k, mapReader.value().readLong().longValue(), "record value: " + i); + } else { + assertNull(mapReader.value().readLong(), "record value: " + i); + } + } + } + } + } + + /* Test Cases */ + + private void createListTypeVectorWithFixedSizeBinaryType( + FieldWriter writer, List buffers) { + for (int i = 0; i < COUNT; i++) { + writer.startList(); + for (int j = 0; j < i % 7; j++) { + if (j % 2 == 0) { + writer.writeNull(); + } else { + ArrowBuf buf = allocator.buffer(4); + buf.setInt(0, j); + FixedSizeBinaryHolder holder = new FixedSizeBinaryHolder(); + holder.byteWidth = 4; + holder.buffer = buf; + writer.fixedSizeBinary().write(holder); + buffers.add(buf); + } + } + writer.endList(); + } + } + + private void checkListTypeVectorWithFixedSizeBinaryType(FieldReader reader) { + for (int i = 0; i < COUNT; i++) { + reader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + reader.next(); + if (j % 2 == 0) { + assertFalse(reader.reader().isSet(), "index is set: " + j); + } else { + NullableFixedSizeBinaryHolder actual = new NullableFixedSizeBinaryHolder(); + reader.reader().read(actual); + assertEquals(j, actual.buffer.getInt(0)); + assertEquals(4, actual.byteWidth); + } + } + } + } + + @Test + public void listScalarType() { + try (ListVector listVector = ListVector.empty("list", allocator)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + createListTypeVectorWithScalarType(listWriter); + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + // validate + checkListTypeVectorWithScalarType(listReader); + } + } + + @Test + public void listViewScalarType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + createListTypeVectorWithScalarType(listViewWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader 
listViewReader = new UnionListViewReader(listViewVector); + // validate + checkListTypeVectorWithScalarType(listViewReader); } } @Test public void testListScalarNull() { - /* Write to a integer list vector - * each list of size 8 and having it's data values alternating between null and a non-null. + /* Write to an integer list vector + * each list of size 8 + * and having its data values alternating between null and a non-null. * Read and verify */ try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - if (j % 2 == 0) { - listWriter.writeNull(); - } else { - IntHolder holder = new IntHolder(); - holder.value = j; - listWriter.write(holder); - } - } - listWriter.endList(); - } + createListTypeVectorWithScalarNull(listWriter); listWriter.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - if (j % 2 == 0) { - assertFalse(listReader.reader().isSet(), "index is set: " + j); - } else { - assertTrue(listReader.reader().isSet(), "index is not set: " + j); - assertEquals(j, listReader.reader().readInteger().intValue()); - } - } - } + checkListTypeVectorWithScalarNull(listReader); + } + } + + @Test + public void testListViewScalarNull() { + /* Write to an integer list vector + * each list of size 8 + * and having its data values alternating between null and a non-null. + * Read and verify + */ + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + createListTypeVectorWithScalarNull(listViewWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + checkListTypeVectorWithScalarNull(listViewReader); } } @@ -335,42 +697,24 @@ public void listDecimalType() { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); DecimalHolder holder = new DecimalHolder(); - holder.buffer = allocator.buffer(DecimalVector.TYPE_WIDTH); - ArrowType arrowType = new ArrowType.Decimal(10, 0, 128); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - if (j % 4 == 0) { - listWriter.writeDecimal(new BigDecimal(j)); - } else if (j % 4 == 1) { - DecimalUtility.writeBigDecimalToArrowBuf( - new BigDecimal(j), holder.buffer, 0, DecimalVector.TYPE_WIDTH); - holder.start = 0; - holder.scale = 0; - holder.precision = 10; - listWriter.write(holder); - } else if (j % 4 == 2) { - DecimalUtility.writeBigDecimalToArrowBuf( - new BigDecimal(j), holder.buffer, 0, DecimalVector.TYPE_WIDTH); - listWriter.writeDecimal(0, holder.buffer, arrowType); - } else { - byte[] value = BigDecimal.valueOf(j).unscaledValue().toByteArray(); - listWriter.writeBigEndianBytesToDecimal(value, arrowType); - } - } - listWriter.endList(); - } + createListTypeVectorWithDecimalType(listWriter, holder); listWriter.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - Object expected = new BigDecimal(j); - Object actual = listReader.reader().readBigDecimal(); - 
assertEquals(expected, actual); - } - } + checkListTypeVectorWithDecimalType(listReader); + holder.buffer.close(); + } + } + + @Test + public void listViewDecimalType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + DecimalHolder holder = new DecimalHolder(); + createListTypeVectorWithDecimalType(listViewWriter, holder); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + checkListTypeVectorWithDecimalType(listViewReader); holder.buffer.close(); } } @@ -380,36 +724,22 @@ public void listTimeStampMilliTZType() { try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - if (j % 2 == 0) { - listWriter.writeNull(); - } else { - TimeStampMilliTZHolder holder = new TimeStampMilliTZHolder(); - holder.timezone = "FakeTimeZone"; - holder.value = j; - listWriter.timeStampMilliTZ().write(holder); - } - } - listWriter.endList(); - } + createListTypeVectorWithTimeStampMilliTZType(listWriter); listWriter.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - if (j % 2 == 0) { - assertFalse(listReader.reader().isSet(), "index is set: " + j); - } else { - NullableTimeStampMilliTZHolder actual = new NullableTimeStampMilliTZHolder(); - listReader.reader().read(actual); - assertEquals(j, actual.value); - assertEquals("FakeTimeZone", actual.timezone); - } - } - } + checkListTypeVectorWithTimeStampMilliTZType(listReader); + } + } + + @Test + public void listViewTimeStampMilliTZType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + createListTypeVectorWithTimeStampMilliTZType(listViewWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + checkListTypeVectorWithTimeStampMilliTZType(listViewReader); } } @@ -418,80 +748,51 @@ public void listDurationType() { try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - if (j % 2 == 0) { - listWriter.writeNull(); - } else { - DurationHolder holder = new DurationHolder(); - holder.unit = TimeUnit.MICROSECOND; - holder.value = j; - listWriter.duration().write(holder); - } - } - listWriter.endList(); - } + createListTypeVectorWithDurationType(listWriter); listWriter.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - if (j % 2 == 0) { - assertFalse(listReader.reader().isSet(), "index is set: " + j); - } else { - NullableDurationHolder actual = new NullableDurationHolder(); - listReader.reader().read(actual); - assertEquals(TimeUnit.MICROSECOND, actual.unit); - assertEquals(j, actual.value); - } - } - } + 
checkListTypeVectorWithDurationType(listReader); + } + } + + @Test + public void listViewDurationType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + createListTypeVectorWithDurationType(listViewWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkListTypeVectorWithDurationType(listReader); } } @Test public void listFixedSizeBinaryType() throws Exception { - List bufs = new ArrayList(); + List buffers = new ArrayList<>(); try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - if (j % 2 == 0) { - listWriter.writeNull(); - } else { - ArrowBuf buf = allocator.buffer(4); - buf.setInt(0, j); - FixedSizeBinaryHolder holder = new FixedSizeBinaryHolder(); - holder.byteWidth = 4; - holder.buffer = buf; - listWriter.fixedSizeBinary().write(holder); - bufs.add(buf); - } - } - listWriter.endList(); - } + createListTypeVectorWithFixedSizeBinaryType(listWriter, buffers); listWriter.setValueCount(COUNT); - UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - if (j % 2 == 0) { - assertFalse(listReader.reader().isSet(), "index is set: " + j); - } else { - NullableFixedSizeBinaryHolder actual = new NullableFixedSizeBinaryHolder(); - listReader.reader().read(actual); - assertEquals(j, actual.buffer.getInt(0)); - assertEquals(4, actual.byteWidth); - } - } - } + UnionListReader listReader = new UnionListReader(listVector); + checkListTypeVectorWithFixedSizeBinaryType(listReader); } - AutoCloseables.close(bufs); + AutoCloseables.close(buffers); + } + + @Test + public void listViewFixedSizeBinaryType() throws Exception { + List buffers = new ArrayList<>(); + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + createListTypeVectorWithFixedSizeBinaryType(listViewWriter, buffers); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkListTypeVectorWithFixedSizeBinaryType(listReader); + } + AutoCloseables.close(buffers); } @Test @@ -499,29 +800,22 @@ public void listScalarTypeNullable() { try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - if (i % 2 == 0) { - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - listWriter.writeInt(j); - } - listWriter.endList(); - } - } + createScalarTypeVectorWithNullableType(listWriter); listWriter.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - if (i % 2 == 0) { - assertTrue(listReader.isSet(), "index is set: " + i); - assertEquals( - i % 7, ((List) listReader.readObject()).size(), "correct length at: " + i); - } else { - assertFalse(listReader.isSet(), "index is not set: " + i); - assertNull(listReader.readObject(), "index is not set: 
" + i); - } - } + checkScalarTypeVectorWithNullableType(listReader); + } + } + + @Test + public void listViewScalarTypeNullable() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + createScalarTypeVectorWithNullableType(listViewWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkScalarTypeVectorWithNullableType(listReader); } } @@ -529,29 +823,25 @@ public void listScalarTypeNullable() { public void listStructType() { try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); - UnionListWriter listWriter = new UnionListWriter(listVector); - StructWriter structWriter = listWriter.struct(); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - structWriter.start(); - structWriter.integer("int").writeInt(j); - structWriter.bigInt("bigInt").writeBigInt(j); - structWriter.end(); - } - listWriter.endList(); - } - listWriter.setValueCount(COUNT); + UnionListWriter listViewWriter = new UnionListWriter(listVector); + StructWriter structWriter = listViewWriter.struct(); + createListTypeVectorWithStructType(listViewWriter, structWriter); + listViewWriter.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - assertEquals( - j, listReader.reader().reader("int").readInteger().intValue(), "record: " + i); - assertEquals(j, listReader.reader().reader("bigInt").readLong().longValue()); - } - } + checkListTypeVectorWithStructType(listReader); + } + } + + @Test + public void listViewStructType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + StructWriter structWriter = listViewWriter.struct(); + createListTypeVectorWithStructType(listViewWriter, structWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkListTypeVectorWithStructType(listReader); } } @@ -573,7 +863,31 @@ public void listListType() { listWriter.endList(); } listWriter.setValueCount(COUNT); - checkListOfLists(listVector); + UnionListReader listReader = new UnionListReader(listVector); + checkListOfListTypes(listReader); + } + } + + @Test + public void listViewListType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + for (int i = 0; i < COUNT; i++) { + listViewWriter.startListView(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listViewWriter.listView(); + innerListWriter.startListView(); + for (int k = 0; k < i % 13; k++) { + innerListWriter.integer().writeInt(k); + } + innerListWriter.endListView(); + } + listViewWriter.endListView(); + } + listViewWriter.setValueCount(COUNT); + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkListOfListTypes(listReader); } } @@ -587,7 +901,6 @@ public void listListType2() { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); ListWriter innerListWriter = listWriter.list(); - for (int i 
= 0; i < COUNT; i++) { listWriter.startList(); for (int j = 0; j < i % 7; j++) { @@ -600,22 +913,31 @@ public void listListType2() { listWriter.endList(); } listWriter.setValueCount(COUNT); - checkListOfLists(listVector); + UnionListReader listReader = new UnionListReader(listVector); + checkListOfListTypes(listReader); } } - private void checkListOfLists(final ListVector listVector) { - UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - FieldReader innerListReader = listReader.reader(); - for (int k = 0; k < i % 13; k++) { - innerListReader.next(); - assertEquals(k, innerListReader.reader().readInteger().intValue(), "record: " + i); + @Test + public void listViewListType2() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + ListWriter innerListWriter = listViewWriter.list(); + for (int i = 0; i < COUNT; i++) { + listViewWriter.startListView(); + for (int j = 0; j < i % 7; j++) { + innerListWriter.startListView(); + for (int k = 0; k < i % 13; k++) { + innerListWriter.integer().writeInt(k); + } + innerListWriter.endListView(); } + listViewWriter.endListView(); } + listViewWriter.setValueCount(COUNT); + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkListOfListTypes(listReader); } } @@ -641,7 +963,35 @@ public void unionListListType() { listWriter.endList(); } listWriter.setValueCount(COUNT); - checkUnionList(listVector); + UnionListReader listReader = new UnionListReader(listVector); + checkUnionListType(listReader); + } + } + + @Test + public void unionListViewListType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + for (int i = 0; i < COUNT; i++) { + listViewWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listViewWriter.listView(); + innerListWriter.startListView(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } + } + innerListWriter.endListView(); + } + listViewWriter.endListView(); + } + listViewWriter.setValueCount(COUNT); + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + checkUnionListType(listViewReader); } } @@ -654,8 +1004,7 @@ public void unionListListType2() { try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - ListWriter innerListWriter = listWriter.list(); - + ListWriter innerListWriter = listWriter.listView(); for (int i = 0; i < COUNT; i++) { listWriter.startList(); for (int j = 0; j < i % 7; j++) { @@ -672,26 +1021,39 @@ public void unionListListType2() { listWriter.endList(); } listWriter.setValueCount(COUNT); - checkUnionList(listVector); + UnionListReader listReader = new UnionListReader(listVector); + checkUnionListType(listReader); } } - private void checkUnionList(ListVector listVector) { - UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - FieldReader innerListReader = 
listReader.reader(); - for (int k = 0; k < i % 13; k++) { - innerListReader.next(); - if (k % 2 == 0) { - assertEquals(k, innerListReader.reader().readInteger().intValue(), "record: " + i); - } else { - assertEquals(k, innerListReader.reader().readLong().longValue(), "record: " + i); + /** + * This test is similar to {@link #unionListViewListType()} but we get the inner list writer once + * at the beginning. + */ + @Test + public void unionListViewListType2() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + ListWriter innerListWriter = listViewWriter.listView(); + for (int i = 0; i < COUNT; i++) { + listViewWriter.startListView(); + for (int j = 0; j < i % 7; j++) { + innerListWriter.startListView(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } } + innerListWriter.endListView(); } + listViewWriter.endListView(); } + listViewWriter.setValueCount(COUNT); + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + checkUnionListType(listViewReader); } } @@ -700,27 +1062,11 @@ public void testListMapType() { try (ListVector listVector = ListVector.empty("list", allocator)) { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); - MapWriter innerMapWriter = listWriter.map(true); - for (int i = 0; i < COUNT; i++) { - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - innerMapWriter.startMap(); - for (int k = 0; k < i % 13; k++) { - innerMapWriter.startEntry(); - innerMapWriter.key().integer().writeInt(k); - if (k % 2 == 0) { - innerMapWriter.value().bigInt().writeBigInt(k); - } - innerMapWriter.endEntry(); - } - innerMapWriter.endMap(); - } - listWriter.endList(); - } + createListTypeVectorWithMapType(listWriter); listWriter.setValueCount(COUNT); - checkListMap(listVector); - + UnionListReader listReader = new UnionListReader(listVector); + checkListTypeMap(listReader); // Verify that the map vector has keysSorted = true MapVector mapVector = (MapVector) listVector.getDataVector(); ArrowType arrowType = mapVector.getField().getFieldType().getType(); @@ -728,23 +1074,20 @@ public void testListMapType() { } } - private void checkListMap(ListVector listVector) { - UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - for (int j = 0; j < i % 7; j++) { - listReader.next(); - UnionMapReader mapReader = (UnionMapReader) listReader.reader(); - for (int k = 0; k < i % 13; k++) { - mapReader.next(); - assertEquals(k, mapReader.key().readInteger().intValue(), "record key: " + i); - if (k % 2 == 0) { - assertEquals(k, mapReader.value().readLong().longValue(), "record value: " + i); - } else { - assertNull(mapReader.value().readLong(), "record value: " + i); - } - } - } + @Test + public void testListViewMapType() { + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + listViewVector.allocateNew(); + UnionListViewWriter listViewWriter = new UnionListViewWriter(listViewVector); + + createListTypeVectorWithMapType(listViewWriter); + listViewWriter.setValueCount(COUNT); + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + checkListTypeMap(listViewReader); + // Verify that the map vector has keysSorted = true + MapVector mapVector = (MapVector) 
listViewVector.getDataVector(); + ArrowType arrowType = mapVector.getField().getFieldType().getType(); + assertTrue(((ArrowType.Map) arrowType).getKeysSorted()); } } @@ -1212,6 +1555,7 @@ public void complexCopierWithList() { ComplexWriter writer = new ComplexWriterImpl("root", parent); StructWriter rootWriter = writer.rootAsStruct(); ListWriter listWriter = rootWriter.list("list"); + StructWriter innerStructWriter = listWriter.struct(); IntWriter outerIntWriter = listWriter.integer(); rootWriter.start(); @@ -1246,6 +1590,47 @@ public void complexCopierWithList() { } } + @Test + public void complexCopierWithListView() { + try (NonNullableStructVector parent = NonNullableStructVector.empty("parent", allocator)) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + StructWriter rootWriter = writer.rootAsStruct(); + ListWriter listViewWriter = rootWriter.listView("listView"); + + StructWriter innerStructWriter = listViewWriter.struct(); + IntWriter outerIntWriter = listViewWriter.integer(); + rootWriter.start(); + listViewWriter.startListView(); + outerIntWriter.writeInt(1); + outerIntWriter.writeInt(2); + innerStructWriter.start(); + IntWriter intWriter = innerStructWriter.integer("a"); + intWriter.writeInt(1); + innerStructWriter.end(); + innerStructWriter.start(); + intWriter = innerStructWriter.integer("a"); + intWriter.writeInt(2); + innerStructWriter.end(); + listViewWriter.endListView(); + rootWriter.end(); + writer.setValueCount(1); + + StructVector structVector = (StructVector) parent.getChild("root"); + TransferPair tp = structVector.getTransferPair(allocator); + tp.splitAndTransfer(0, 1); + NonNullableStructVector toStructVector = (NonNullableStructVector) tp.getTo(); + JsonStringHashMap toMapValue = (JsonStringHashMap) toStructVector.getObject(0); + JsonStringArrayList object = (JsonStringArrayList) toMapValue.get("listView"); + assertEquals(1, object.get(0)); + assertEquals(2, object.get(1)); + JsonStringHashMap innerStruct = (JsonStringHashMap) object.get(2); + assertEquals(1, innerStruct.get("a")); + innerStruct = (JsonStringHashMap) object.get(3); + assertEquals(2, innerStruct.get("a")); + toStructVector.close(); + } + } + @Test public void testSingleStructWriter1() { /* initialize a SingleStructWriter with empty StructVector and then lazily @@ -1262,6 +1647,7 @@ public void testSingleStructWriter1() { Float4Writer float4Writer = singleStructWriter.float4("float4Field"); Float8Writer float8Writer = singleStructWriter.float8("float8Field"); ListWriter listWriter = singleStructWriter.list("listField"); + ListWriter listViewWriter = singleStructWriter.listView("listViewField"); MapWriter mapWriter = singleStructWriter.map("mapField", false); int intValue = 100; @@ -1285,6 +1671,14 @@ public void testSingleStructWriter1() { listWriter.integer().writeInt(intValue + i + 3); listWriter.endList(); + listViewWriter.setPosition(i); + listViewWriter.startListView(); + listViewWriter.integer().writeInt(intValue + i); + listViewWriter.integer().writeInt(intValue + i + 1); + listViewWriter.integer().writeInt(intValue + i + 2); + listViewWriter.integer().writeInt(intValue + i + 3); + listViewWriter.endListView(); + mapWriter.setPosition(i); mapWriter.startMap(); mapWriter.startEntry(); @@ -1323,6 +1717,8 @@ public void testSingleStructWriter1() { Float4Reader float4Reader = singleStructReader.reader("float4Field"); Float8Reader float8Reader = singleStructReader.reader("float8Field"); UnionListReader listReader = (UnionListReader) singleStructReader.reader("listField"); + 
UnionListViewReader listViewReader = + (UnionListViewReader) singleStructReader.reader("listViewField"); UnionMapReader mapReader = (UnionMapReader) singleStructReader.reader("mapField"); for (int i = 0; i < initialCapacity; i++) { @@ -1331,6 +1727,7 @@ public void testSingleStructWriter1() { float4Reader.setPosition(i); float8Reader.setPosition(i); listReader.setPosition(i); + listViewReader.setPosition(i); mapReader.setPosition(i); assertEquals(intValue + i, intReader.readInteger().intValue()); @@ -1343,6 +1740,11 @@ public void testSingleStructWriter1() { assertEquals(intValue + i + j, listReader.reader().readInteger().intValue()); } + for (int j = 0; j < 4; j++) { + listViewReader.next(); + assertEquals(intValue + i + j, listViewReader.reader().readInteger().intValue()); + } + for (int k = 0; k < 4; k += 2) { mapReader.next(); assertEquals(intValue + k + i, mapReader.key().readInteger().intValue()); @@ -1362,40 +1764,31 @@ public void testListWriterWithNulls() { UnionListWriter listWriter = listVector.getWriter(); // expected listVector : [[null], null, [2, 4], null, [null], null, [6, 12], ...] - for (int i = 0; i < COUNT; i++) { - listWriter.setPosition(i); - if (i % 2 == 0) { - listWriter.startList(); - if (i % 4 == 0) { - listWriter.integer().writeNull(); - } else { - listWriter.integer().writeInt(i); - listWriter.integer().writeInt(i * 2); - } - listWriter.endList(); - } else { - listWriter.writeNull(); - } - } + createNullsWithListWriters(listWriter); listVector.setValueCount(COUNT); UnionListReader listReader = new UnionListReader(listVector); - for (int i = 0; i < COUNT; i++) { - listReader.setPosition(i); - if (i % 2 == 0) { - assertTrue(listReader.isSet()); - listReader.next(); - if (i % 4 == 0) { - assertNull(listReader.reader().readInteger()); - } else { - assertEquals(i, listReader.reader().readInteger().intValue()); - listReader.next(); - assertEquals(i * 2, listReader.reader().readInteger().intValue()); - } - } else { - assertFalse(listReader.isSet()); - } - } + checkNullsWithListWriters(listReader); + } + } + + @Test + public void testListViewWriterWithNulls() { + try (ListViewVector listViewVector = ListViewVector.empty("listView", allocator)) { + listViewVector.setInitialCapacity(COUNT); + listViewVector.allocateNew(); + listViewVector + .getValidityBuffer() + .setOne(0, (int) listViewVector.getValidityBuffer().capacity()); + + UnionListViewWriter listWriter = listViewVector.getWriter(); + + // expected listVector : [[null], null, [2, 4], null, [null], null, [6, 12], ...] + createNullsWithListWriters(listWriter); + listViewVector.setValueCount(COUNT); + + UnionListViewReader listReader = new UnionListViewReader(listViewVector); + checkNullsWithListWriters(listReader); } } @@ -1452,6 +1845,61 @@ public void testListOfListWriterWithNulls() { } } + @Test + public void testListViewOfListViewWriterWithNulls() { + try (ListViewVector listViewVector = ListViewVector.empty("listViewoflistView", allocator)) { + listViewVector.setInitialCapacity(COUNT); + listViewVector.allocateNew(); + listViewVector + .getValidityBuffer() + .setOne(0, (int) listViewVector.getValidityBuffer().capacity()); + + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + // create list : [ [null], null, [[null, 2, 4]], null, [null], null, [[null, 6, 12]], ... 
] + for (int i = 0; i < COUNT; i++) { + listViewWriter.setPosition(i); + if (i % 2 == 0) { + listViewWriter.startListView(); + if (i % 4 == 0) { + listViewWriter.listView().writeNull(); + } else { + listViewWriter.listView().startListView(); + listViewWriter.listView().integer().writeNull(); + listViewWriter.listView().integer().writeInt(i); + listViewWriter.listView().integer().writeInt(i * 2); + listViewWriter.listView().endListView(); + } + listViewWriter.endListView(); + } else { + listViewWriter.writeNull(); + } + } + listViewVector.setValueCount(COUNT); + + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + for (int i = 0; i < COUNT; i++) { + listViewReader.setPosition(i); + if (i % 2 == 0) { + assertTrue(listViewReader.isSet()); + listViewReader.next(); + if (i % 4 == 0) { + assertFalse(listViewReader.reader().isSet()); + } else { + listViewReader.reader().next(); + assertFalse(listViewReader.reader().reader().isSet()); + listViewReader.reader().next(); + assertEquals(i, listViewReader.reader().reader().readInteger().intValue()); + listViewReader.reader().next(); + assertEquals(i * 2, listViewReader.reader().reader().readInteger().intValue()); + } + } else { + assertFalse(listViewReader.isSet()); + } + } + } + } + @Test public void testListOfListOfListWriterWithNulls() { try (ListVector listVector = ListVector.empty("listoflistoflist", allocator)) { @@ -1515,6 +1963,72 @@ public void testListOfListOfListWriterWithNulls() { } } + @Test + public void testListViewOfListViewOfListViewWriterWithNulls() { + try (ListViewVector listViewVector = + ListViewVector.empty("listViewoflistViewoflistView", allocator)) { + listViewVector.setInitialCapacity(COUNT); + listViewVector.allocateNew(); + listViewVector + .getValidityBuffer() + .setOne(0, (int) listViewVector.getValidityBuffer().capacity()); + + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + // create list : [ null, [null], [[null]], [[[null, 1, 2]]], null, [null], ... 
+ for (int i = 0; i < COUNT; i++) { + listViewWriter.setPosition(i); + if (i % 4 == 0) { + listViewWriter.writeNull(); + } else { + listViewWriter.startListView(); + if (i % 4 == 1) { + listViewWriter.listView().writeNull(); + } else if (i % 4 == 2) { + listViewWriter.listView().startListView(); + listViewWriter.listView().listView().writeNull(); + listViewWriter.listView().endListView(); + } else { + listViewWriter.listView().startListView(); + listViewWriter.listView().listView().startListView(); + listViewWriter.listView().listView().integer().writeNull(); + listViewWriter.listView().listView().integer().writeInt(i); + listViewWriter.listView().listView().integer().writeInt(i * 2); + listViewWriter.listView().listView().endListView(); + listViewWriter.listView().endListView(); + } + listViewWriter.endListView(); + } + } + listViewVector.setValueCount(COUNT); + + UnionListViewReader listViewReader = new UnionListViewReader(listViewVector); + for (int i = 0; i < COUNT; i++) { + listViewReader.setPosition(i); + if (i % 4 == 0) { + assertFalse(listViewReader.isSet()); + } else { + assertTrue(listViewReader.isSet()); + listViewReader.next(); + if (i % 4 == 1) { + assertFalse(listViewReader.reader().isSet()); + } else if (i % 4 == 2) { + listViewReader.reader().next(); + assertFalse(listViewReader.reader().reader().isSet()); + } else { + listViewReader.reader().next(); + listViewReader.reader().reader().next(); + assertFalse(listViewReader.reader().reader().reader().isSet()); + listViewReader.reader().reader().next(); + assertEquals(i, listViewReader.reader().reader().reader().readInteger().intValue()); + listViewReader.reader().reader().next(); + assertEquals(i * 2, listViewReader.reader().reader().reader().readInteger().intValue()); + } + } + } + } + } + @Test public void testStructOfList() { try (StructVector structVector = StructVector.empty("struct1", allocator)) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java index c18f6faeb548f..281f050dfb662 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java @@ -579,7 +579,7 @@ public void validateUnionData(int count, VectorSchemaRoot root) { FieldReader unionReader = root.getVector("union").getReader(); for (int i = 0; i < count; i++) { unionReader.setPosition(i); - switch (i % 4) { + switch (i % 5) { case 0: assertEquals(i, unionReader.readInteger().intValue()); break; @@ -590,6 +590,9 @@ public void validateUnionData(int count, VectorSchemaRoot root) { assertEquals(i % 3, unionReader.size()); break; case 3: + assertEquals(3, unionReader.size()); + break; + case 4: NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); unionReader.reader("timestamp").read(h); assertEquals(i, h.value); @@ -612,9 +615,10 @@ public void writeUnionData(int count, StructVector parent) { IntWriter intWriter = rootWriter.integer("union"); BigIntWriter bigIntWriter = rootWriter.bigInt("union"); ListWriter listWriter = rootWriter.list("union"); + ListWriter listViewWriter = rootWriter.listView("union"); StructWriter structWriter = rootWriter.struct("union"); for (int i = 0; i < count; i++) { - switch (i % 4) { + switch (i % 5) { case 0: intWriter.setPosition(i); intWriter.writeInt(i); @@ -632,6 +636,14 @@ public void writeUnionData(int count, StructVector parent) { listWriter.endList(); break; case 3: + listViewWriter.setPosition(i); + 
listViewWriter.startListView(); + for (int j = 0; j < i % 5; j++) { + listViewWriter.varChar().writeVarChar(0, 3, varchar); + } + listViewWriter.endListView(); + break; + case 4: structWriter.setPosition(i); structWriter.start(); structWriter.timeStampMilli("timestamp").writeTimeStampMilli(i); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 9b2c80ef181d1..69e16dc470351 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -61,9 +61,11 @@ import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VariableWidthFieldVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.holders.IntervalDayHolder; import org.apache.arrow.vector.types.Types; @@ -728,4 +730,34 @@ public static void setVector(StructVector vector, Map> val } vector.setValueCount(valueCount); } + + /** Populate values for {@link ListViewVector}. */ + public static void setVector(ListViewVector vector, List... values) { + vector.allocateNewSafe(); + Types.MinorType type = Types.MinorType.INT; + vector.addOrGetVector(FieldType.nullable(type.getType())); + + IntVector dataVector = (IntVector) vector.getDataVector(); + dataVector.allocateNew(); + + // set underlying vectors + int curPos = 0; + for (int i = 0; i < values.length; i++) { + vector.getOffsetBuffer().setInt((long) i * BaseRepeatedValueViewVector.OFFSET_WIDTH, curPos); + if (values[i] == null) { + BitVectorHelper.unsetBit(vector.getValidityBuffer(), i); + } else { + BitVectorHelper.setBit(vector.getValidityBuffer(), i); + for (int value : values[i]) { + dataVector.setSafe(curPos, value); + curPos += 1; + } + } + vector + .getSizeBuffer() + .setInt((long) i * BaseRepeatedValueViewVector.SIZE_WIDTH, values[i].size()); + } + dataVector.setValueCount(curPos); + vector.setValueCount(values.length); + } } From 3193b95ae757750669f6181441171fd84240c749 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 17:11:22 +0200 Subject: [PATCH 54/73] GH-43514: [Python] Deprecate passing build flags to setup.py (#43515) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change As mentioned in https://github.com/apache/arrow/pull/41494#issuecomment-2092829903 (while refactoring how to specify to the pyarrow build which components to build, i.e. to let it follow the Arrow C++ components by default), we do have a "feature" that you can specify which components to build directly to setup.py, like `python setup.py build_ext --with-parquet`. This is currently not used in our own codebase, and is also not documented anymore, but we did document it in the past. 
In general calling setup.py directly is not recommended (although for development installs, it is still useful), furthermore there are alternatives to those flags (relying on Arrow C++ or setting an environment variable), and this would go away anyhow in case we would move away from setuptools at some point. So I think it is better to deprecate those options. ### What changes are included in this PR? Whenever a user passes such a `--with-` flag, a warning is raised. ### Are these changes tested? Tested it locally ### Are there any user-facing changes? Only for developers building pyarrow from source, they have to potentially update their build instructions. * GitHub Issue: #43514 Lead-authored-by: Joris Van den Bossche Co-authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- python/setup.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 11cd7028023be..2cef1c2d31cf1 100755 --- a/python/setup.py +++ b/python/setup.py @@ -24,6 +24,7 @@ import re import shlex import sys +import warnings if sys.version_info >= (3, 10): import sysconfig @@ -84,6 +85,23 @@ def strtobool(val): raise ValueError("invalid truth value %r" % (val,)) +MSG_DEPR_SETUP_BUILD_FLAGS = """ + !! + + *********************************************************************** + The '{}' flag is being passed to setup.py, but this is + deprecated. + + If a certain component is available in Arrow C++, it will automatically + be enabled for the PyArrow build as well. If you want to force the + build of a certain component, you can still use the + PYARROW_WITH_$COMPONENT environment variable. + *********************************************************************** + + !! +""" + + class build_ext(_build_ext): _found_names = () @@ -258,9 +276,16 @@ def append_cmake_bool(value, varname): varname, 'on' if value else 'off')) def append_cmake_component(flag, varname): - # only pass this to cmake is the user pass the --with-component + # only pass this to cmake if the user pass the --with-component # flag to setup.py build_ext if flag is not None: + flag_name = ( + "--with-" + + varname.removeprefix("PYARROW_").lower().replace("_", "-")) + warnings.warn( + MSG_DEPR_SETUP_BUILD_FLAGS.format(flag_name), + UserWarning, stacklevel=2 + ) append_cmake_bool(flag, varname) if self.cmake_generator: From 510cb7f7114c6df4528a8072a001c999899c8393 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:39:34 -0400 Subject: [PATCH 55/73] MINOR: [JS] Bump rollup from 4.18.0 to 4.19.2 in /js (#43528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [rollup](https://github.com/rollup/rollup) from 4.18.0 to 4.19.2.

Release notes

Sourced from rollup's releases.

v4.19.2 (2024-08-01)

Bug Fixes

  • Avoid "cannot get value of null" error when using optional chaining with namespaces (#5597)

v4.19.1 (2024-07-27)

Bug Fixes

  • Do not remove parentheses when tree-shaking logical expressions (#5584)
  • Do not ignore side effects in calls left of an optional chaining operator (#5589)

v4.19.0 (2024-07-20)

Features

  • Implement support for decorators (#5562)

Bug Fixes

  • Improve sourcemap generation when tree-shaking logical expressions (#5581)

... (truncated)


Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package.json | 2 +- js/yarn.lock | 198 ++++++++++++++++++++++++------------------------ 2 files changed, 100 insertions(+), 100 deletions(-) diff --git a/js/package.json b/js/package.json index 4edff4d363183..c41834bf561b2 100644 --- a/js/package.json +++ b/js/package.json @@ -102,7 +102,7 @@ "mkdirp": "3.0.1", "multistream": "4.1.0", "regenerator-runtime": "0.14.1", - "rollup": "4.18.0", + "rollup": "4.19.2", "rxjs": "7.8.1", "ts-jest": "29.1.4", "ts-node": "10.9.2", diff --git a/js/yarn.lock b/js/yarn.lock index 37ef0d09ca005..eab285216d21c 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1040,85 +1040,85 @@ estree-walker "^2.0.2" picomatch "^2.3.1" -"@rollup/rollup-android-arm-eabi@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.18.0.tgz#bbd0e616b2078cd2d68afc9824d1fadb2f2ffd27" - integrity sha512-Tya6xypR10giZV1XzxmH5wr25VcZSncG0pZIjfePT0OVBvqNEurzValetGNarVrGiq66EBVAFn15iYX4w6FKgQ== - -"@rollup/rollup-android-arm64@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.18.0.tgz#97255ef6384c5f73f4800c0de91f5f6518e21203" - integrity sha512-avCea0RAP03lTsDhEyfy+hpfr85KfyTctMADqHVhLAF3MlIkq83CP8UfAHUssgXTYd+6er6PaAhx/QGv4L1EiA== - -"@rollup/rollup-darwin-arm64@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.18.0.tgz#b6dd74e117510dfe94541646067b0545b42ff096" - integrity sha512-IWfdwU7KDSm07Ty0PuA/W2JYoZ4iTj3TUQjkVsO/6U+4I1jN5lcR71ZEvRh52sDOERdnNhhHU57UITXz5jC1/w== - -"@rollup/rollup-darwin-x64@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.18.0.tgz#e07d76de1cec987673e7f3d48ccb8e106d42c05c" - integrity sha512-n2LMsUz7Ynu7DoQrSQkBf8iNrjOGyPLrdSg802vk6XT3FtsgX6JbE8IHRvposskFm9SNxzkLYGSq9QdpLYpRNA== - -"@rollup/rollup-linux-arm-gnueabihf@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.18.0.tgz#9f1a6d218b560c9d75185af4b8bb42f9f24736b8" - integrity sha512-C/zbRYRXFjWvz9Z4haRxcTdnkPt1BtCkz+7RtBSuNmKzMzp3ZxdM28Mpccn6pt28/UWUCTXa+b0Mx1k3g6NOMA== - -"@rollup/rollup-linux-arm-musleabihf@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.18.0.tgz#53618b92e6ffb642c7b620e6e528446511330549" - integrity sha512-l3m9ewPgjQSXrUMHg93vt0hYCGnrMOcUpTz6FLtbwljo2HluS4zTXFy2571YQbisTnfTKPZ01u/ukJdQTLGh9A== - -"@rollup/rollup-linux-arm64-gnu@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.18.0.tgz#99a7ba5e719d4f053761a698f7b52291cefba577" - integrity sha512-rJ5D47d8WD7J+7STKdCUAgmQk49xuFrRi9pZkWoRD1UeSMakbcepWXPF8ycChBoAqs1pb2wzvbY6Q33WmN2ftw== - -"@rollup/rollup-linux-arm64-musl@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.18.0.tgz#f53db99a45d9bc00ce94db8a35efa7c3c144a58c" - integrity sha512-be6Yx37b24ZwxQ+wOQXXLZqpq4jTckJhtGlWGZs68TgdKXJgw54lUUoFYrg6Zs/kjzAQwEwYbp8JxZVzZLRepQ== - -"@rollup/rollup-linux-powerpc64le-gnu@4.18.0": - version "4.18.0" - resolved 
"https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.18.0.tgz#cbb0837408fe081ce3435cf3730e090febafc9bf" - integrity sha512-hNVMQK+qrA9Todu9+wqrXOHxFiD5YmdEi3paj6vP02Kx1hjd2LLYR2eaN7DsEshg09+9uzWi2W18MJDlG0cxJA== - -"@rollup/rollup-linux-riscv64-gnu@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.18.0.tgz#8ed09c1d1262ada4c38d791a28ae0fea28b80cc9" - integrity sha512-ROCM7i+m1NfdrsmvwSzoxp9HFtmKGHEqu5NNDiZWQtXLA8S5HBCkVvKAxJ8U+CVctHwV2Gb5VUaK7UAkzhDjlg== - -"@rollup/rollup-linux-s390x-gnu@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.18.0.tgz#938138d3c8e0c96f022252a28441dcfb17afd7ec" - integrity sha512-0UyyRHyDN42QL+NbqevXIIUnKA47A+45WyasO+y2bGJ1mhQrfrtXUpTxCOrfxCR4esV3/RLYyucGVPiUsO8xjg== - -"@rollup/rollup-linux-x64-gnu@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.18.0.tgz#1a7481137a54740bee1ded4ae5752450f155d942" - integrity sha512-xuglR2rBVHA5UsI8h8UbX4VJ470PtGCf5Vpswh7p2ukaqBGFTnsfzxUBetoWBWymHMxbIG0Cmx7Y9qDZzr648w== - -"@rollup/rollup-linux-x64-musl@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.18.0.tgz#f1186afc601ac4f4fc25fac4ca15ecbee3a1874d" - integrity sha512-LKaqQL9osY/ir2geuLVvRRs+utWUNilzdE90TpyoX0eNqPzWjRm14oMEE+YLve4k/NAqCdPkGYDaDF5Sw+xBfg== - -"@rollup/rollup-win32-arm64-msvc@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.18.0.tgz#ed6603e93636a96203c6915be4117245c1bd2daf" - integrity sha512-7J6TkZQFGo9qBKH0pk2cEVSRhJbL6MtfWxth7Y5YmZs57Pi+4x6c2dStAUvaQkHQLnEQv1jzBUW43GvZW8OFqA== - -"@rollup/rollup-win32-ia32-msvc@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.18.0.tgz#14e0b404b1c25ebe6157a15edb9c46959ba74c54" - integrity sha512-Txjh+IxBPbkUB9+SXZMpv+b/vnTEtFyfWZgJ6iyCmt2tdx0OF5WhFowLmnh8ENGNpfUlUZkdI//4IEmhwPieNg== - -"@rollup/rollup-win32-x64-msvc@4.18.0": - version "4.18.0" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.18.0.tgz#5d694d345ce36b6ecf657349e03eb87297e68da4" - integrity sha512-UOo5FdvOL0+eIVTgS4tIdbW+TtnBLWg1YBCcU2KWM7nuNwRz9bksDX1bekJJCpu25N1DVWaCwnT39dVQxzqS8g== +"@rollup/rollup-android-arm-eabi@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.19.2.tgz#6b991cb44bf69e50163528ea85bed545330ba821" + integrity sha512-OHflWINKtoCFSpm/WmuQaWW4jeX+3Qt3XQDepkkiFTsoxFc5BpF3Z5aDxFZgBqRjO6ATP5+b1iilp4kGIZVWlA== + +"@rollup/rollup-android-arm64@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.19.2.tgz#5d3c8c2f9742d62ba258cc378bd2d4720f0c431c" + integrity sha512-k0OC/b14rNzMLDOE6QMBCjDRm3fQOHAL8Ldc9bxEWvMo4Ty9RY6rWmGetNTWhPo+/+FNd1lsQYRd0/1OSix36A== + +"@rollup/rollup-darwin-arm64@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.19.2.tgz#8eac8682a34a705bb6a57eb3e739fd6bbedfabed" + integrity sha512-IIARRgWCNWMTeQH+kr/gFTHJccKzwEaI0YSvtqkEBPj7AshElFq89TyreKNFAGh5frLfDCbodnq+Ye3dqGKPBw== + +"@rollup/rollup-darwin-x64@4.19.2": + version "4.19.2" + resolved 
"https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.19.2.tgz#70a9953fc624bd7f645901f4250f6b5807ac7e92" + integrity sha512-52udDMFDv54BTAdnw+KXNF45QCvcJOcYGl3vQkp4vARyrcdI/cXH8VXTEv/8QWfd6Fru8QQuw1b2uNersXOL0g== + +"@rollup/rollup-linux-arm-gnueabihf@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.19.2.tgz#8f6c4ff4c4972413ff94345080380d4e3caa3c69" + integrity sha512-r+SI2t8srMPYZeoa1w0o/AfoVt9akI1ihgazGYPQGRilVAkuzMGiTtexNZkrPkQsyFrvqq/ni8f3zOnHw4hUbA== + +"@rollup/rollup-linux-arm-musleabihf@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.19.2.tgz#5d3c0fe5ea5ddf2feb511b3cb031df17eaa7e33d" + integrity sha512-+tYiL4QVjtI3KliKBGtUU7yhw0GMcJJuB9mLTCEauHEsqfk49gtUBXGtGP3h1LW8MbaTY6rSFIQV1XOBps1gBA== + +"@rollup/rollup-linux-arm64-gnu@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.19.2.tgz#b7f104388b2f5624d9f8adfff10ba59af8ab8ed1" + integrity sha512-OR5DcvZiYN75mXDNQQxlQPTv4D+uNCUsmSCSY2FolLf9W5I4DSoJyg7z9Ea3TjKfhPSGgMJiey1aWvlWuBzMtg== + +"@rollup/rollup-linux-arm64-musl@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.19.2.tgz#6d5ca6d3904309bec285ea5202d589cebb93dee4" + integrity sha512-Hw3jSfWdUSauEYFBSFIte6I8m6jOj+3vifLg8EU3lreWulAUpch4JBjDMtlKosrBzkr0kwKgL9iCfjA8L3geoA== + +"@rollup/rollup-linux-powerpc64le-gnu@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.19.2.tgz#4df9be1396ea9eb0ca99fd0f2e858008d7f063e3" + integrity sha512-rhjvoPBhBwVnJRq/+hi2Q3EMiVF538/o9dBuj9TVLclo9DuONqt5xfWSaE6MYiFKpo/lFPJ/iSI72rYWw5Hc7w== + +"@rollup/rollup-linux-riscv64-gnu@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.19.2.tgz#80d63c5562915a2f8616a04251fcaee0218112b0" + integrity sha512-EAz6vjPwHHs2qOCnpQkw4xs14XJq84I81sDRGPEjKPFVPBw7fwvtwhVjcZR6SLydCv8zNK8YGFblKWd/vRmP8g== + +"@rollup/rollup-linux-s390x-gnu@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.19.2.tgz#ef62e9bc5cc3b84fcfe96ec0a42d1989691217b3" + integrity sha512-IJSUX1xb8k/zN9j2I7B5Re6B0NNJDJ1+soezjNojhT8DEVeDNptq2jgycCOpRhyGj0+xBn7Cq+PK7Q+nd2hxLA== + +"@rollup/rollup-linux-x64-gnu@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.19.2.tgz#6a275282a0080fee98ddd9fda0de23c4c6bafd48" + integrity sha512-OgaToJ8jSxTpgGkZSkwKE+JQGihdcaqnyHEFOSAU45utQ+yLruE1dkonB2SDI8t375wOKgNn8pQvaWY9kPzxDQ== + +"@rollup/rollup-linux-x64-musl@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.19.2.tgz#64f0c704107e6b45b26dd8c2e1ff64246e4a1251" + integrity sha512-5V3mPpWkB066XZZBgSd1lwozBk7tmOkKtquyCJ6T4LN3mzKENXyBwWNQn8d0Ci81hvlBw5RoFgleVpL6aScLYg== + +"@rollup/rollup-win32-arm64-msvc@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.19.2.tgz#bada17b0c5017ff58d0feba401c43ff5a646c693" + integrity sha512-ayVstadfLeeXI9zUPiKRVT8qF55hm7hKa+0N1V6Vj+OTNFfKSoUxyZvzVvgtBxqSb5URQ8sK6fhwxr9/MLmxdA== + +"@rollup/rollup-win32-ia32-msvc@4.19.2": + version 
"4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.19.2.tgz#a716d862f6ac39d88bdb825e27f63aeb0387cd66" + integrity sha512-Mda7iG4fOLHNsPqjWSjANvNZYoW034yxgrndof0DwCy0D3FvTjeNo+HGE6oGWgvcLZNLlcp0hLEFcRs+UGsMLg== + +"@rollup/rollup-win32-x64-msvc@4.19.2": + version "4.19.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.19.2.tgz#d67206c5f2e4b2832ce360bbbde194e96d16dc51" + integrity sha512-DPi0ubYhSow/00YqmG1jWm3qt1F8aXziHc/UNy8bo9cpCacqhuWu+iSq/fp2SyEQK7iYTZ60fBU9cat3MXTjIQ== "@rollup/stream@3.0.1": version "3.0.1" @@ -6221,29 +6221,29 @@ rimraf@^3.0.2: dependencies: glob "^7.1.3" -rollup@4.18.0: - version "4.18.0" - resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.18.0.tgz#497f60f0c5308e4602cf41136339fbf87d5f5dda" - integrity sha512-QmJz14PX3rzbJCN1SG4Xe/bAAX2a6NpCP8ab2vfu2GiUr8AQcr2nCV/oEO3yneFarB67zk8ShlIyWb2LGTb3Sg== +rollup@4.19.2: + version "4.19.2" + resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.19.2.tgz#4985cd2028965157e8d674a70e49f33aca9038eb" + integrity sha512-6/jgnN1svF9PjNYJ4ya3l+cqutg49vOZ4rVgsDKxdl+5gpGPnByFXWGyfH9YGx9i3nfBwSu1Iyu6vGwFFA0BdQ== dependencies: "@types/estree" "1.0.5" optionalDependencies: - "@rollup/rollup-android-arm-eabi" "4.18.0" - "@rollup/rollup-android-arm64" "4.18.0" - "@rollup/rollup-darwin-arm64" "4.18.0" - "@rollup/rollup-darwin-x64" "4.18.0" - "@rollup/rollup-linux-arm-gnueabihf" "4.18.0" - "@rollup/rollup-linux-arm-musleabihf" "4.18.0" - "@rollup/rollup-linux-arm64-gnu" "4.18.0" - "@rollup/rollup-linux-arm64-musl" "4.18.0" - "@rollup/rollup-linux-powerpc64le-gnu" "4.18.0" - "@rollup/rollup-linux-riscv64-gnu" "4.18.0" - "@rollup/rollup-linux-s390x-gnu" "4.18.0" - "@rollup/rollup-linux-x64-gnu" "4.18.0" - "@rollup/rollup-linux-x64-musl" "4.18.0" - "@rollup/rollup-win32-arm64-msvc" "4.18.0" - "@rollup/rollup-win32-ia32-msvc" "4.18.0" - "@rollup/rollup-win32-x64-msvc" "4.18.0" + "@rollup/rollup-android-arm-eabi" "4.19.2" + "@rollup/rollup-android-arm64" "4.19.2" + "@rollup/rollup-darwin-arm64" "4.19.2" + "@rollup/rollup-darwin-x64" "4.19.2" + "@rollup/rollup-linux-arm-gnueabihf" "4.19.2" + "@rollup/rollup-linux-arm-musleabihf" "4.19.2" + "@rollup/rollup-linux-arm64-gnu" "4.19.2" + "@rollup/rollup-linux-arm64-musl" "4.19.2" + "@rollup/rollup-linux-powerpc64le-gnu" "4.19.2" + "@rollup/rollup-linux-riscv64-gnu" "4.19.2" + "@rollup/rollup-linux-s390x-gnu" "4.19.2" + "@rollup/rollup-linux-x64-gnu" "4.19.2" + "@rollup/rollup-linux-x64-musl" "4.19.2" + "@rollup/rollup-win32-arm64-msvc" "4.19.2" + "@rollup/rollup-win32-ia32-msvc" "4.19.2" + "@rollup/rollup-win32-x64-msvc" "4.19.2" fsevents "~2.3.2" run-parallel@^1.1.9: From 598dda6fa2684e9356078ba4def40eaf80ce3567 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:39:58 -0400 Subject: [PATCH 56/73] MINOR: [JS] Bump command-line-usage from 7.0.1 to 7.0.3 in /js (#43526) Bumps [command-line-usage](https://github.com/75lb/command-line-usage) from 7.0.1 to 7.0.3.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/yarn.lock | 41 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index eab285216d21c..fc7dbcae3a20d 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -2,14 +2,6 @@ # yarn lockfile v1 -"@75lb/deep-merge@^1.1.1": - version "1.1.1" - resolved "https://registry.yarnpkg.com/@75lb/deep-merge/-/deep-merge-1.1.1.tgz#3b06155b90d34f5f8cc2107d796f1853ba02fd6d" - integrity sha512-xvgv6pkMGBA6GwdyJbNAnDmfAIR/DfWhrj9jgWh3TY7gRm3KO46x/GPjRg6wJ0nOepwqrNxFfojebh0Df4h4Tw== - dependencies: - lodash.assignwith "^4.2.0" - typical "^7.1.1" - "@aashutoshrathi/word-wrap@^1.2.3": version "1.2.6" resolved "https://registry.yarnpkg.com/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz#bd9154aec9983f77b3a034ecaa015c2e4201f6cf" @@ -2494,14 +2486,14 @@ command-line-args@^5.2.1: lodash.camelcase "^4.3.0" typical "^4.0.0" -command-line-usage@^7.0.0, command-line-usage@^7.0.1: - version "7.0.1" - resolved "https://registry.yarnpkg.com/command-line-usage/-/command-line-usage-7.0.1.tgz#e540afef4a4f3bc501b124ffde33956309100655" - integrity sha512-NCyznE//MuTjwi3y84QVUGEOT+P5oto1e1Pk/jFPVdPPfsG03qpTIl3yw6etR+v73d0lXsoojRpvbru2sqePxQ== +command-line-usage@^7.0.1: + version "7.0.3" + resolved "https://registry.yarnpkg.com/command-line-usage/-/command-line-usage-7.0.3.tgz#6bce992354f6af10ecea2b631bfdf0c8b3bfaea3" + integrity sha512-PqMLy5+YGwhMh1wS04mVG44oqDsgyLRSKJBdOo1bnYhMKBW65gZF1dRp2OZRhiTjgUHljy99qkO7bsctLaw35Q== dependencies: array-back "^6.2.2" chalk-template "^0.4.0" - table-layout "^3.0.0" + table-layout "^4.1.0" typical "^7.1.1" commander@^2.20.0: @@ -5054,11 +5046,6 @@ locate-path@^6.0.0: dependencies: p-locate "^5.0.0" -lodash.assignwith@^4.2.0: - version "4.2.0" - resolved "https://registry.yarnpkg.com/lodash.assignwith/-/lodash.assignwith-4.2.0.tgz#127a97f02adc41751a954d24b0de17e100e038eb" - integrity sha512-ZznplvbvtjK2gMvnQ1BR/zqPFZmS6jbK4p+6Up4xcRYA7yMIwxHCfbTcrYxXKzzqLsQ05eJPVznEW3tuwV7k1g== - lodash.camelcase@^4.3.0: version "4.3.0" resolved "https://registry.yarnpkg.com/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz#b28aa6288a2b9fc651035c7711f65ab6190331a6" @@ -6569,11 +6556,6 @@ stream-exhaust@^1.0.1, stream-exhaust@^1.0.2: resolved "https://registry.yarnpkg.com/stream-exhaust/-/stream-exhaust-1.0.2.tgz#acdac8da59ef2bc1e17a2c0ccf6c320d120e555d" integrity sha512-b/qaq/GlBK5xaq1yrK9/zFcyRSTNxmcZwFLGSTG0mXgZl/4Z6GgiyYOXOvY7N3eEvFRAG1bkDRz5EPGSvPYQlw== -stream-read-all@^3.0.1: - version "3.0.1" - resolved "https://registry.yarnpkg.com/stream-read-all/-/stream-read-all-3.0.1.tgz#60762ae45e61d93ba0978cda7f3913790052ad96" - integrity sha512-EWZT9XOceBPlVJRrYcykW8jyRSZYbkb/0ZK36uLEmoWVO5gxBOnntNTseNzfREsqxqdfEGQrD8SXQ3QWbBmq8A== - stream-shift@^1.0.0: version "1.0.3" resolved "https://registry.yarnpkg.com/stream-shift/-/stream-shift-1.0.3.tgz#85b8fab4d71010fc3ba8772e8046cc49b8a3864b" @@ -6750,17 +6732,12 @@ sver-compat@^1.5.0: es6-iterator "^2.0.1" es6-symbol "^3.1.1" -table-layout@^3.0.0: - version "3.0.2" - resolved "https://registry.yarnpkg.com/table-layout/-/table-layout-3.0.2.tgz#69c2be44388a5139b48c59cf21e73b488021769a" - integrity sha512-rpyNZYRw+/C+dYkcQ3Pr+rLxW4CfHpXjPDnG7lYhdRoUcZTUt+KEsX+94RGp/aVp/MQU35JCITv2T/beY4m+hw== +table-layout@^4.1.0: + version "4.1.1" + resolved "https://registry.yarnpkg.com/table-layout/-/table-layout-4.1.1.tgz#0f72965de1a5c0c1419c9ba21cae4e73a2f73a42" + integrity 
sha512-iK5/YhZxq5GO5z8wb0bY1317uDF3Zjpha0QFFLA8/trAoiLbQD0HUbMesEaxyzUgDxi2QlcbM8IvqOlEjgoXBA== dependencies: - "@75lb/deep-merge" "^1.1.1" array-back "^6.2.2" - command-line-args "^5.2.1" - command-line-usage "^7.0.0" - stream-read-all "^3.0.1" - typical "^7.1.1" wordwrapjs "^5.1.0" tapable@^2.1.1, tapable@^2.2.0: From d0b37a1ffd5bdd7605a8d6191771a24621808639 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:40:09 -0400 Subject: [PATCH 57/73] MINOR: [JS] Bump eslint-plugin-unicorn from 54.0.0 to 55.0.0 in /js (#43523) Bumps [eslint-plugin-unicorn](https://github.com/sindresorhus/eslint-plugin-unicorn) from 54.0.0 to 55.0.0.
Release notes

Sourced from eslint-plugin-unicorn's releases.

v55.0.0

New rules

Improvements

  • Add TypeScript types (#2382) 68e0f13

Fixes

  • no-single-promise-in-promise-methods: Remove broken autofix for Promise.all() (#2386) 8d28b6e
  • prefer-node-protocol: Ignore Bun modules (#2384) a45b24a
  • no-negation-in-equality-check: Ignore boolean type casting (#2379) 37e00dd

https://github.com/sindresorhus/eslint-plugin-unicorn/compare/v54.0.0...v55.0.0
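For context on what this bump touches, here is a minimal sketch of how a project typically enables the plugin under ESLint 8's legacy config (an illustration, not part of the quoted release notes; `plugin:unicorn/recommended` and `unicorn/prefer-node-protocol` are the plugin's documented identifiers, while the file layout is hypothetical):

    // .eslintrc.cjs -- hypothetical minimal setup for eslint-plugin-unicorn
    module.exports = {
      plugins: ['unicorn'],
      // pulls in the plugin's recommended rule set
      extends: ['plugin:unicorn/recommended'],
      rules: {
        // one of the rules adjusted in v55.0.0 (it now ignores Bun modules)
        'unicorn/prefer-node-protocol': 'error',
      },
    };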

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=eslint-plugin-unicorn&package-manager=npm_and_yarn&previous-version=54.0.0&new-version=55.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package.json | 2 +- js/yarn.lock | 49 ++++++++++--------------------------------------- 2 files changed, 11 insertions(+), 40 deletions(-) diff --git a/js/package.json b/js/package.json index c41834bf561b2..caa033aebc5dd 100644 --- a/js/package.json +++ b/js/package.json @@ -83,7 +83,7 @@ "esbuild-plugin-alias": "0.2.1", "eslint": "8.57.0", "eslint-plugin-jest": "28.5.0", - "eslint-plugin-unicorn": "54.0.0", + "eslint-plugin-unicorn": "55.0.0", "gulp": "4.0.2", "glob": "10.4.1", "google-closure-compiler": "20240317.0.0", diff --git a/js/yarn.lock b/js/yarn.lock index fc7dbcae3a20d..6544439042e9b 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -611,21 +611,6 @@ minimatch "^3.1.2" strip-json-comments "^3.1.1" -"@eslint/eslintrc@^3.0.2": - version "3.1.0" - resolved "https://registry.yarnpkg.com/@eslint/eslintrc/-/eslintrc-3.1.0.tgz#dbd3482bfd91efa663cbe7aa1f506839868207b6" - integrity sha512-4Bfj15dVJdoy3RfZmmo86RK1Fwzn6SstsvK9JS+BaVKqC6QQQQyXekNaC+g+LKNgkQ+2VhGAzm6hO40AhMR3zQ== - dependencies: - ajv "^6.12.4" - debug "^4.3.2" - espree "^10.0.1" - globals "^14.0.0" - ignore "^5.2.0" - import-fresh "^3.2.1" - js-yaml "^4.1.0" - minimatch "^3.1.2" - strip-json-comments "^3.1.1" - "@eslint/js@8.57.0": version "8.57.0" resolved "https://registry.yarnpkg.com/@eslint/js/-/js-8.57.0.tgz#a5417ae8427873f1dd08b70b3574b453e67b5f7f" @@ -1708,7 +1693,7 @@ acorn@^6.4.1: resolved "https://registry.yarnpkg.com/acorn/-/acorn-6.4.2.tgz#35866fd710528e92de10cf06016498e47e39e1e6" integrity sha512-XtGIhXwF8YM8bJhGxG5kXgjkEuNGLTkoYqVE+KMR+aspr4KGYmKYg7yUe3KghyQ9yheNwLnjmzh/7+gfDBmHCQ== -acorn@^8.0.4, acorn@^8.11.3, acorn@^8.4.1, acorn@^8.7.1, acorn@^8.8.2, acorn@^8.9.0: +acorn@^8.0.4, acorn@^8.4.1, acorn@^8.7.1, acorn@^8.8.2, acorn@^8.9.0: version "8.11.3" resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.3.tgz#71e0b14e13a4ec160724b38fb7b0f233b1b81d7a" integrity sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg== @@ -3040,18 +3025,18 @@ eslint-plugin-jest@28.5.0: dependencies: "@typescript-eslint/utils" "^6.0.0 || ^7.0.0" -eslint-plugin-unicorn@54.0.0: - version "54.0.0" - resolved "https://registry.yarnpkg.com/eslint-plugin-unicorn/-/eslint-plugin-unicorn-54.0.0.tgz#ce3ea853e8fd7ca2bda2fd6065bf065adb5d8b6d" - integrity sha512-XxYLRiYtAWiAjPv6z4JREby1TAE2byBC7wlh0V4vWDCpccOSU1KovWV//jqPXF6bq3WKxqX9rdjoRQ1EhdmNdQ== +eslint-plugin-unicorn@55.0.0: + version "55.0.0" + resolved "https://registry.yarnpkg.com/eslint-plugin-unicorn/-/eslint-plugin-unicorn-55.0.0.tgz#e2aeb397914799895702480970e7d148df5bcc7b" + integrity sha512-n3AKiVpY2/uDcGrS3+QsYDkjPfaOrNrsfQxU9nt5nitd9KuvVXrfAvgCO9DYPSfap+Gqjw9EOrXIsBp5tlHZjA== dependencies: "@babel/helper-validator-identifier" "^7.24.5" "@eslint-community/eslint-utils" "^4.4.0" - "@eslint/eslintrc" "^3.0.2" ci-info "^4.0.0" clean-regexp "^1.0.0" core-js-compat "^3.37.0" esquery "^1.5.0" + globals "^15.7.0" indent-string "^4.0.0" is-builtin-module "^3.2.1" jsesc "^3.0.2" @@ -3083,11 +3068,6 @@ eslint-visitor-keys@^3.3.0, eslint-visitor-keys@^3.4.1, eslint-visitor-keys@^3.4 resolved "https://registry.yarnpkg.com/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz#0cd72fe8550e3c2eae156a96a4dddcd1c8ac5800" integrity sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag== -eslint-visitor-keys@^4.0.0: - version "4.0.0" - resolved 
"https://registry.yarnpkg.com/eslint-visitor-keys/-/eslint-visitor-keys-4.0.0.tgz#e3adc021aa038a2a8e0b2f8b0ce8f66b9483b1fb" - integrity sha512-OtIRv/2GyiF6o/d8K7MYKKbXrOUBIK6SfkIRM4Z0dY3w+LiQ0vy3F57m0Z71bjbyeiWFiHJ8brqnmE6H6/jEuw== - eslint@8.57.0: version "8.57.0" resolved "https://registry.yarnpkg.com/eslint/-/eslint-8.57.0.tgz#c786a6fd0e0b68941aaf624596fb987089195668" @@ -3142,15 +3122,6 @@ esniff@^2.0.1: event-emitter "^0.3.5" type "^2.7.2" -espree@^10.0.1: - version "10.0.1" - resolved "https://registry.yarnpkg.com/espree/-/espree-10.0.1.tgz#600e60404157412751ba4a6f3a2ee1a42433139f" - integrity sha512-MWkrWZbJsL2UwnjxTX3gG8FneachS/Mwg7tdGXce011sJd5b0JG54vat5KHnfSBODZ3Wvzd2WnjxyzsRoVv+ww== - dependencies: - acorn "^8.11.3" - acorn-jsx "^5.3.2" - eslint-visitor-keys "^4.0.0" - espree@^9.6.0, espree@^9.6.1: version "9.6.1" resolved "https://registry.yarnpkg.com/espree/-/espree-9.6.1.tgz#a2a17b8e434690a5432f2f8018ce71d331a48c6f" @@ -3718,10 +3689,10 @@ globals@^13.19.0: dependencies: type-fest "^0.20.2" -globals@^14.0.0: - version "14.0.0" - resolved "https://registry.yarnpkg.com/globals/-/globals-14.0.0.tgz#898d7413c29babcf6bafe56fcadded858ada724e" - integrity sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ== +globals@^15.7.0: + version "15.9.0" + resolved "https://registry.yarnpkg.com/globals/-/globals-15.9.0.tgz#e9de01771091ffbc37db5714dab484f9f69ff399" + integrity sha512-SmSKyLLKFbSr6rptvP8izbyxJL4ILwqO9Jg23UA0sDlGlu58V59D1//I3vlc0KJphVdUR7vMjHIplYnzBxorQA== globby@^11.1.0: version "11.1.0" From e78f716d2e1859606a9d86d39b6adb095a1f9f24 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:40:34 -0400 Subject: [PATCH 58/73] MINOR: [JS] Bump esbuild from 0.22.0 to 0.23.0 in /js (#43522) Bumps [esbuild](https://github.com/evanw/esbuild) from 0.22.0 to 0.23.0.
Release notes

Sourced from esbuild's releases.

v0.23.0

This release deliberately contains backwards-incompatible changes. To avoid automatically picking up releases like this, you should either be pinning the exact version of esbuild in your package.json file (recommended) or be using a version range syntax that only accepts patch upgrades such as ^0.22.0 or ~0.22.0. See npm's documentation about semver for more information.

  • Revert the recent change to avoid bundling dependencies for node (#3819)

    This release reverts the recent change in version 0.22.0 that made --packages=external the default behavior with --platform=node. The default is now back to --packages=bundle.

    I've just been made aware that Amazon doesn't pin their dependencies in their "AWS CDK" product, which means that whenever esbuild publishes a new release, many people (potentially everyone?) using their SDK around the world instantly starts using it without Amazon checking that it works first. This change in version 0.22.0 happened to break their SDK. I'm amazed that things haven't broken before this point. This revert attempts to avoid these problems for Amazon's customers. Hopefully Amazon will pin their dependencies in the future.

    In addition, this is probably a sign that esbuild is used widely enough that it now needs to switch to a more complicated release model. I may have esbuild use a beta channel model for further development.

  • Fix preserving collapsed JSX whitespace (#3818)

    When transformed, certain whitespace inside JSX elements is ignored completely if it collapses to an empty string. However, the whitespace should only be ignored if the JSX is being transformed, not if it's being preserved. This release fixes a bug where esbuild was previously incorrectly ignoring collapsed whitespace with --jsx=preserve. Here is an example:

    // Original code
    <Foo>
      <Bar />
    </Foo>
    

    // Old output (with --jsx=preserve)
    <Foo><Bar /></Foo>;

    // New output (with --jsx=preserve)
    <Foo>
    <Bar />
    </Foo>;
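To see the preserved-whitespace behavior described above in isolation, a minimal sketch using esbuild's JavaScript transform API (an illustration, not part of the quoted release notes; `transformSync` and the `loader`/`jsx` options are esbuild's documented API):

    import { transformSync } from 'esbuild';

    // JSX whose inner whitespace collapses to an empty string
    const source = '<Foo>\n  <Bar />\n</Foo>;';

    // With jsx: 'preserve', esbuild 0.23.0 keeps the collapsed whitespace
    // in the output instead of dropping it as 0.22.x did
    const { code } = transformSync(source, { loader: 'jsx', jsx: 'preserve' });
    console.log(code);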

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=esbuild&package-manager=npm_and_yarn&previous-version=0.22.0&new-version=0.23.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package.json | 2 +- js/yarn.lock | 248 ++++++++++++++++++++++++------------------------ 2 files changed, 125 insertions(+), 125 deletions(-) diff --git a/js/package.json b/js/package.json index caa033aebc5dd..cbf0670e018b6 100644 --- a/js/package.json +++ b/js/package.json @@ -79,7 +79,7 @@ "cross-env": "7.0.3", "del": "7.1.0", "del-cli": "5.1.0", - "esbuild": "0.22.0", + "esbuild": "0.23.0", "esbuild-plugin-alias": "0.2.1", "eslint": "8.57.0", "eslint-plugin-jest": "28.5.0", diff --git a/js/yarn.lock b/js/yarn.lock index 6544439042e9b..dc1fc99a0ecf4 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -354,235 +354,235 @@ resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz#c7184a326533fcdf1b8ee0733e21c713b975575f" integrity sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ== -"@esbuild/aix-ppc64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.22.0.tgz#6ff1ec509335ffbaee3fc4a5a11373d6f029b2c4" - integrity sha512-uvQR2crZ/zgzSHDvdygHyNI+ze9zwS8mqz0YtGXotSqvEE0UkYE9s+FZKQNTt1VtT719mfP3vHrUdCpxBNQZhQ== +"@esbuild/aix-ppc64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.23.0.tgz#145b74d5e4a5223489cabdc238d8dad902df5259" + integrity sha512-3sG8Zwa5fMcA9bgqB8AfWPQ+HFke6uD3h1s3RIwUNK8EG7a4buxvuFTs3j1IMs2NXAk9F30C/FF4vxRgQCcmoQ== "@esbuild/android-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz#09d9b4357780da9ea3a7dfb833a1f1ff439b4052" integrity sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A== -"@esbuild/android-arm64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.22.0.tgz#a02ef8650fe5ce17807c9f3229a36d326d2b07ea" - integrity sha512-UKhPb3o2gAB/bfXcl58ZXTn1q2oVu1rEu/bKrCtmm+Nj5MKUbrOwR5WAixE2v+lk0amWuwPvhnPpBRLIGiq7ig== +"@esbuild/android-arm64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.23.0.tgz#453bbe079fc8d364d4c5545069e8260228559832" + integrity sha512-EuHFUYkAVfU4qBdyivULuu03FhJO4IJN9PGuABGrFy4vUuzk91P2d+npxHcFdpUnfYKy0PuV+n6bKIpHOB3prQ== "@esbuild/android-arm@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.21.5.tgz#9b04384fb771926dfa6d7ad04324ecb2ab9b2e28" integrity sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg== -"@esbuild/android-arm@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.22.0.tgz#dd26ec407db736eee0eb060195a43aa13f618013" - integrity sha512-PBnyP+r8vJE4ifxsWys9l+Mc2UY/yYZOpX82eoyGISXXb3dRr0M21v+s4fgRKWMFPMSf/iyowqPW/u7ScSUkjQ== +"@esbuild/android-arm@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.23.0.tgz#26c806853aa4a4f7e683e519cd9d68e201ebcf99" + integrity sha512-+KuOHTKKyIKgEEqKbGTK8W7mPp+hKinbMBeEnNzjJGyFcWsfrXjSTNluJHCY1RqhxFurdD8uNXQDei7qDlR6+g== "@esbuild/android-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.21.5.tgz#29918ec2db754cedcb6c1b04de8cd6547af6461e" integrity sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA== 
-"@esbuild/android-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.22.0.tgz#f02771a20be264ccc22478dcc7de8f2bde858af8" - integrity sha512-IjTYtvIrjhR41Ijy2dDPgYjQHWG/x/A4KXYbs1fiU3efpRdoxMChK3oEZV6GPzVEzJqxFgcuBaiX1kwEvWUxSw== +"@esbuild/android-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.23.0.tgz#1e51af9a6ac1f7143769f7ee58df5b274ed202e6" + integrity sha512-WRrmKidLoKDl56LsbBMhzTTBxrsVwTKdNbKDalbEZr0tcsBgCLbEtoNthOW6PX942YiYq8HzEnb4yWQMLQuipQ== "@esbuild/darwin-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz#e495b539660e51690f3928af50a76fb0a6ccff2a" integrity sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ== -"@esbuild/darwin-arm64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.22.0.tgz#d905f2b951aeba328dd02e3a09f86b5d4e5e6741" - integrity sha512-mqt+Go4y9wRvEz81bhKd9RpHsQR1LwU8Xm6jZRUV/xpM7cIQFbFH6wBCLPTNsdELBvfoHeumud7X78jQQJv2TA== +"@esbuild/darwin-arm64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.23.0.tgz#d996187a606c9534173ebd78c58098a44dd7ef9e" + integrity sha512-YLntie/IdS31H54Ogdn+v50NuoWF5BDkEUFpiOChVa9UnKpftgwzZRrI4J132ETIi+D8n6xh9IviFV3eXdxfow== "@esbuild/darwin-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz#c13838fa57372839abdddc91d71542ceea2e1e22" integrity sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw== -"@esbuild/darwin-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.22.0.tgz#d07b4fe501fe9985590285b2790039ed4743f86e" - integrity sha512-vTaTQ9OgYc3VTaWtOE5pSuDT6H3d/qSRFRfSBbnxFfzAvYoB3pqKXA0LEbi/oT8GUOEAutspfRMqPj2ezdFaMw== +"@esbuild/darwin-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.23.0.tgz#30c8f28a7ef4e32fe46501434ebe6b0912e9e86c" + integrity sha512-IMQ6eme4AfznElesHUPDZ+teuGwoRmVuuixu7sv92ZkdQcPbsNHzutd+rAfaBKo8YK3IrBEi9SLLKWJdEvJniQ== "@esbuild/freebsd-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz#646b989aa20bf89fd071dd5dbfad69a3542e550e" integrity sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g== -"@esbuild/freebsd-arm64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.22.0.tgz#4251e0a14716116f4fa7e22d908f47408b6c2fb5" - integrity sha512-0e1ZgoobJzaGnR4reD7I9rYZ7ttqdh1KPvJWnquUoDJhL0rYwdneeLailBzd2/4g/U5p4e5TIHEWa68NF2hFpQ== +"@esbuild/freebsd-arm64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.23.0.tgz#30f4fcec8167c08a6e8af9fc14b66152232e7fb4" + integrity sha512-0muYWCng5vqaxobq6LB3YNtevDFSAZGlgtLoAc81PjUfiFz36n4KMpwhtAd4he8ToSI3TGyuhyx5xmiWNYZFyw== "@esbuild/freebsd-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz#aa615cfc80af954d3458906e38ca22c18cf5c261" integrity sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ== -"@esbuild/freebsd-x64@0.22.0": - version "0.22.0" - resolved 
"https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.22.0.tgz#7dbd35616a71f8a9b61a9435c5a79d87fc0b2f1a" - integrity sha512-BFgyYwlCwRWyPQJtkzqq2p6pJbiiWgp0P9PNf7a5FQ1itKY4czPuOMAlFVItirSmEpRPCeImuwePNScZS0pL5Q== +"@esbuild/freebsd-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.23.0.tgz#1003a6668fe1f5d4439e6813e5b09a92981bc79d" + integrity sha512-XKDVu8IsD0/q3foBzsXGt/KjD/yTKBCIwOHE1XwiXmrRwrX6Hbnd5Eqn/WvDekddK21tfszBSrE/WMaZh+1buQ== "@esbuild/linux-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz#70ac6fa14f5cb7e1f7f887bcffb680ad09922b5b" integrity sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q== -"@esbuild/linux-arm64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.22.0.tgz#77cded446dd0c3b723d272e0243b3d9ddb3cb46e" - integrity sha512-V/K2rctCUgC0PCXpN7AqT4hoazXKgIYugFGu/myk2+pfe6jTW2guz/TBwq4cZ7ESqusR/IzkcQaBkcjquuBWsw== +"@esbuild/linux-arm64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.23.0.tgz#3b9a56abfb1410bb6c9138790f062587df3e6e3a" + integrity sha512-j1t5iG8jE7BhonbsEg5d9qOYcVZv/Rv6tghaXM/Ug9xahM0nX/H2gfu6X6z11QRTMT6+aywOMA8TDkhPo8aCGw== "@esbuild/linux-arm@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz#fc6fd11a8aca56c1f6f3894f2bea0479f8f626b9" integrity sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA== -"@esbuild/linux-arm@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.22.0.tgz#6587d3e423e09766ea997229827e292e7c4acd6f" - integrity sha512-KEMWiA9aGuPUD4BH5yjlhElLgaRXe+Eri6gKBoDazoPBTo1BXc/e6IW5FcJO9DoL19FBeCxgONyh95hLDNepIg== +"@esbuild/linux-arm@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.23.0.tgz#237a8548e3da2c48cd79ae339a588f03d1889aad" + integrity sha512-SEELSTEtOFu5LPykzA395Mc+54RMg1EUgXP+iw2SJ72+ooMwVsgfuwXo5Fn0wXNgWZsTVHwY2cg4Vi/bOD88qw== "@esbuild/linux-ia32@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz#3271f53b3f93e3d093d518d1649d6d68d346ede2" integrity sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg== -"@esbuild/linux-ia32@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.22.0.tgz#2d06d7b4abc443e05a820ff50d4c2d98cc04c22f" - integrity sha512-r2ZZqkOMOrpUhzNwxI7uLAHIDwkfeqmTnrv1cjpL/rjllPWszgqmprd/om9oviKXUBpMqHbXmppvjAYgISb26Q== +"@esbuild/linux-ia32@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.23.0.tgz#4269cd19cb2de5de03a7ccfc8855dde3d284a238" + integrity sha512-P7O5Tkh2NbgIm2R6x1zGJJsnacDzTFcRWZyTTMgFdVit6E98LTxO+v8LCCLWRvPrjdzXHx9FEOA8oAZPyApWUA== "@esbuild/linux-loong64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz#ed62e04238c57026aea831c5a130b73c0f9f26df" integrity sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg== -"@esbuild/linux-loong64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.22.0.tgz#a3e7faabe9a046ac4557bc515ce0981cfe5a6e0f" - integrity 
sha512-qaowLrV/YOMAL2RfKQ4C/VaDzAuLDuylM2sd/LH+4OFirMl6CuDpRlCq4u49ZBaVV8pkI/Y+hTdiibvQRhojCA== +"@esbuild/linux-loong64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.23.0.tgz#82b568f5658a52580827cc891cb69d2cb4f86280" + integrity sha512-InQwepswq6urikQiIC/kkx412fqUZudBO4SYKu0N+tGhXRWUqAx+Q+341tFV6QdBifpjYgUndV1hhMq3WeJi7A== "@esbuild/linux-mips64el@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz#e79b8eb48bf3b106fadec1ac8240fb97b4e64cbe" integrity sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg== -"@esbuild/linux-mips64el@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.22.0.tgz#3a2877a78f6719e5eed4cfdded5121c5ab9305a4" - integrity sha512-hgrezzjQTRxjkQ5k08J6rtZN5PNnkWx/Rz6Kmj9gnsdCAX1I4Dn4ZPqvFRkXo55Q3pnVQJBwbdtrTO7tMGtyVA== +"@esbuild/linux-mips64el@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.23.0.tgz#9a57386c926262ae9861c929a6023ed9d43f73e5" + integrity sha512-J9rflLtqdYrxHv2FqXE2i1ELgNjT+JFURt/uDMoPQLcjWQA5wDKgQA4t/dTqGa88ZVECKaD0TctwsUfHbVoi4w== "@esbuild/linux-ppc64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz#5f2203860a143b9919d383ef7573521fb154c3e4" integrity sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w== -"@esbuild/linux-ppc64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.22.0.tgz#6609478066083e05cc1854a8b272daf62a7e944b" - integrity sha512-ewxg6FLLUio883XgSjfULEmDl3VPv/TYNnRprVAS3QeGFLdCYdx1tIudBcd7n9jIdk82v1Ajov4jx87qW7h9+g== +"@esbuild/linux-ppc64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.23.0.tgz#f3a79fd636ba0c82285d227eb20ed8e31b4444f6" + integrity sha512-cShCXtEOVc5GxU0fM+dsFD10qZ5UpcQ8AM22bYj0u/yaAykWnqXJDpd77ublcX6vdDsWLuweeuSNZk4yUxZwtw== "@esbuild/linux-riscv64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz#07bcafd99322d5af62f618cb9e6a9b7f4bb825dc" integrity sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA== -"@esbuild/linux-riscv64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.22.0.tgz#d786a89903cf98e8d34befe6a71c69562bb4ceac" - integrity sha512-Az5XbgSJC2lE8XK8pdcutsf9RgdafWdTpUK/+6uaDdfkviw/B4JCwAfh1qVeRWwOohwdsl4ywZrWBNWxwrPLFg== +"@esbuild/linux-riscv64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.23.0.tgz#f9d2ef8356ce6ce140f76029680558126b74c780" + integrity sha512-HEtaN7Y5UB4tZPeQmgz/UhzoEyYftbMXrBCUjINGjh3uil+rB/QzzpMshz3cNUxqXN7Vr93zzVtpIDL99t9aRw== "@esbuild/linux-s390x@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz#b7ccf686751d6a3e44b8627ababc8be3ef62d8de" integrity sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A== -"@esbuild/linux-s390x@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.22.0.tgz#a7ab13ae163307ac615dac5ce7f60a6b0a067d59" - integrity 
sha512-8j4a2ChT9+V34NNNY9c/gMldutaJFmfMacTPq4KfNKwv2fitBCLYjee7c+Vxaha2nUhPK7cXcZpJtJ3+Y7ZdVQ== +"@esbuild/linux-s390x@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.23.0.tgz#45390f12e802201f38a0229e216a6aed4351dfe8" + integrity sha512-WDi3+NVAuyjg/Wxi+o5KPqRbZY0QhI9TjrEEm+8dmpY9Xir8+HE/HNx2JoLckhKbFopW0RdO2D72w8trZOV+Wg== "@esbuild/linux-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz#6d8f0c768e070e64309af8004bb94e68ab2bb3b0" integrity sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ== -"@esbuild/linux-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.22.0.tgz#24949de431013354da1d8c29e53299798f8c27ef" - integrity sha512-JUQyOnpbAkkRFOk/AhsEemz5TfWN4FJZxVObUlnlNCbe7QBl61ZNfM4cwBXayQA6laMJMUcqLHaYQHAB6YQ95Q== +"@esbuild/linux-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.23.0.tgz#c8409761996e3f6db29abcf9b05bee8d7d80e910" + integrity sha512-a3pMQhUEJkITgAw6e0bWA+F+vFtCciMjW/LPtoj99MhVt+Mfb6bbL9hu2wmTZgNd994qTAEw+U/r6k3qHWWaOQ== "@esbuild/netbsd-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz#bbe430f60d378ecb88decb219c602667387a6047" integrity sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg== -"@esbuild/netbsd-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.22.0.tgz#bc3f51c41eaab89cf5fdb09d0c633affb39cb1a1" - integrity sha512-11PoCoHXo4HFNbLsXuMB6bpMPWGDiw7xETji6COdJss4SQZLvcgNoeSqWtATRm10Jj1uEHiaIk4N0PiN6x4Fcg== +"@esbuild/netbsd-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.23.0.tgz#ba70db0114380d5f6cfb9003f1d378ce989cd65c" + integrity sha512-cRK+YDem7lFTs2Q5nEv/HHc4LnrfBCbH5+JHu6wm2eP+d8OZNoSMYgPZJq78vqQ9g+9+nMuIsAO7skzphRXHyw== -"@esbuild/openbsd-arm64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/openbsd-arm64/-/openbsd-arm64-0.22.0.tgz#7cb42e3a0d3da039d1a4b7ccbd0c19b0f71ae453" - integrity sha512-Ezlhu/YyITmXwKSB+Zu/QqD7cxrjrpiw85cc0Rbd3AWr2wsgp+dWbWOE8MqHaLW9NKMZvuL0DhbJbvzR7F6Zvg== +"@esbuild/openbsd-arm64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-arm64/-/openbsd-arm64-0.23.0.tgz#72fc55f0b189f7a882e3cf23f332370d69dfd5db" + integrity sha512-suXjq53gERueVWu0OKxzWqk7NxiUWSUlrxoZK7usiF50C6ipColGR5qie2496iKGYNLhDZkPxBI3erbnYkU0rQ== "@esbuild/openbsd-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz#99d1cf2937279560d2104821f5ccce220cb2af70" integrity sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow== -"@esbuild/openbsd-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.22.0.tgz#194aa9915323962e9ea66c5a13ff3e1db272a683" - integrity sha512-ufjdW5tFJGUjlH9j/5cCE9lrwRffyZh+T4vYvoDKoYsC6IXbwaFeV/ENxeNXcxotF0P8CDzoICXVSbJaGBhkrw== +"@esbuild/openbsd-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.23.0.tgz#b6ae7a0911c18fe30da3db1d6d17a497a550e5d8" + integrity sha512-6p3nHpby0DM/v15IFKMjAaayFhqnXV52aEmv1whZHX56pdkK+MEaLoQWj+H42ssFarP1PcomVhbsR4pkz09qBg== "@esbuild/sunos-x64@0.21.5": version 
"0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz#08741512c10d529566baba837b4fe052c8f3487b" integrity sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg== -"@esbuild/sunos-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.22.0.tgz#2be9d2459ae181ebedb6470e4469349a27c4f060" - integrity sha512-zY6ly/AoSmKnmNTowDJsK5ehra153/5ZhqxNLfq9NRsTTltetr+yHHcQ4RW7QDqw4JC8A1uC1YmeSfK9NRcK1w== +"@esbuild/sunos-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.23.0.tgz#58f0d5e55b9b21a086bfafaa29f62a3eb3470ad8" + integrity sha512-BFelBGfrBwk6LVrmFzCq1u1dZbG4zy/Kp93w2+y83Q5UGYF1d8sCzeLI9NXjKyujjBBniQa8R8PzLFAUrSM9OA== "@esbuild/win32-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz#675b7385398411240735016144ab2e99a60fc75d" integrity sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A== -"@esbuild/win32-arm64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.22.0.tgz#6b4224f2d049c26f37026904210a4293e34c2747" - integrity sha512-Kml5F7tv/1Maam0pbbCrvkk9vj046dPej30kFzlhXnhuCtYYBP6FGy/cLbc5yUT1lkZznGLf2OvuvmLjscO5rw== +"@esbuild/win32-arm64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.23.0.tgz#b858b2432edfad62e945d5c7c9e5ddd0f528ca6d" + integrity sha512-lY6AC8p4Cnb7xYHuIxQ6iYPe6MfO2CC43XXKo9nBXDb35krYt7KGhQnOkRGar5psxYkircpCqfbNDB4uJbS2jQ== "@esbuild/win32-ia32@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz#1bfc3ce98aa6ca9a0969e4d2af72144c59c1193b" integrity sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA== -"@esbuild/win32-ia32@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.22.0.tgz#4a1184f6fd4a7594c4f1e68b1e649248534f7832" - integrity sha512-IOgwn+mYTM3RrcydP4Og5IpXh+ftN8oF+HELTXSmbWBlujuci4Qa3DTeO+LEErceisI7KUSfEIiX+WOUlpELkw== +"@esbuild/win32-ia32@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.23.0.tgz#167ef6ca22a476c6c0c014a58b4f43ae4b80dec7" + integrity sha512-7L1bHlOTcO4ByvI7OXVI5pNN6HSu6pUQq9yodga8izeuB1KcT2UkHaH6118QJwopExPn0rMHIseCTx1CRo/uNA== "@esbuild/win32-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz#acad351d582d157bb145535db2a6ff53dd514b5c" integrity sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw== -"@esbuild/win32-x64@0.22.0": - version "0.22.0" - resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.22.0.tgz#4b83e9449a205e7d94d5368035450fc1680fe525" - integrity sha512-4bDHJrk2WHBXJPhy1y80X7/5b5iZTZP3LGcKIlAP1J+KqZ4zQAPMLEzftGyjjfcKbA4JDlPt/+2R/F1ZTeRgrw== +"@esbuild/win32-x64@0.23.0": + version "0.23.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.23.0.tgz#db44a6a08520b5f25bbe409f34a59f2d4bcc7ced" + integrity sha512-Arm+WgUFLUATuoxCJcahGuk6Yj9Pzxd6l11Zb/2aAuv5kWWvvfhLFo2fni4uSK5vzlUdCGZ/BdV5tH8klj8p8g== "@eslint-community/eslint-utils@^4.2.0", "@eslint-community/eslint-utils@^4.4.0": version "4.4.0" @@ -2934,35 +2934,35 @@ esbuild-plugin-alias@0.2.1: resolved 
"https://registry.yarnpkg.com/esbuild-plugin-alias/-/esbuild-plugin-alias-0.2.1.tgz#45a86cb941e20e7c2bc68a2bea53562172494fcb" integrity sha512-jyfL/pwPqaFXyKnj8lP8iLk6Z0m099uXR45aSN8Av1XD4vhvQutxxPzgA2bTcAwQpa1zCXDcWOlhFgyP3GKqhQ== -esbuild@0.22.0: - version "0.22.0" - resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.22.0.tgz#9742e664aac9f61e2898f4c27bd4dd4272e6f661" - integrity sha512-zNYA6bFZsVnsU481FnGAQjLDW0Pl/8BGG7EvAp15RzUvGC+ME7hf1q7LvIfStEQBz/iEHuBJCYcOwPmNCf1Tlw== +esbuild@0.23.0: + version "0.23.0" + resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.23.0.tgz#de06002d48424d9fdb7eb52dbe8e95927f852599" + integrity sha512-1lvV17H2bMYda/WaFb2jLPeHU3zml2k4/yagNMG8Q/YtfMjCwEUZa2eXXMgZTVSL5q1n4H7sQ0X6CdJDqqeCFA== optionalDependencies: - "@esbuild/aix-ppc64" "0.22.0" - "@esbuild/android-arm" "0.22.0" - "@esbuild/android-arm64" "0.22.0" - "@esbuild/android-x64" "0.22.0" - "@esbuild/darwin-arm64" "0.22.0" - "@esbuild/darwin-x64" "0.22.0" - "@esbuild/freebsd-arm64" "0.22.0" - "@esbuild/freebsd-x64" "0.22.0" - "@esbuild/linux-arm" "0.22.0" - "@esbuild/linux-arm64" "0.22.0" - "@esbuild/linux-ia32" "0.22.0" - "@esbuild/linux-loong64" "0.22.0" - "@esbuild/linux-mips64el" "0.22.0" - "@esbuild/linux-ppc64" "0.22.0" - "@esbuild/linux-riscv64" "0.22.0" - "@esbuild/linux-s390x" "0.22.0" - "@esbuild/linux-x64" "0.22.0" - "@esbuild/netbsd-x64" "0.22.0" - "@esbuild/openbsd-arm64" "0.22.0" - "@esbuild/openbsd-x64" "0.22.0" - "@esbuild/sunos-x64" "0.22.0" - "@esbuild/win32-arm64" "0.22.0" - "@esbuild/win32-ia32" "0.22.0" - "@esbuild/win32-x64" "0.22.0" + "@esbuild/aix-ppc64" "0.23.0" + "@esbuild/android-arm" "0.23.0" + "@esbuild/android-arm64" "0.23.0" + "@esbuild/android-x64" "0.23.0" + "@esbuild/darwin-arm64" "0.23.0" + "@esbuild/darwin-x64" "0.23.0" + "@esbuild/freebsd-arm64" "0.23.0" + "@esbuild/freebsd-x64" "0.23.0" + "@esbuild/linux-arm" "0.23.0" + "@esbuild/linux-arm64" "0.23.0" + "@esbuild/linux-ia32" "0.23.0" + "@esbuild/linux-loong64" "0.23.0" + "@esbuild/linux-mips64el" "0.23.0" + "@esbuild/linux-ppc64" "0.23.0" + "@esbuild/linux-riscv64" "0.23.0" + "@esbuild/linux-s390x" "0.23.0" + "@esbuild/linux-x64" "0.23.0" + "@esbuild/netbsd-x64" "0.23.0" + "@esbuild/openbsd-arm64" "0.23.0" + "@esbuild/openbsd-x64" "0.23.0" + "@esbuild/sunos-x64" "0.23.0" + "@esbuild/win32-arm64" "0.23.0" + "@esbuild/win32-ia32" "0.23.0" + "@esbuild/win32-x64" "0.23.0" esbuild@^0.21.5: version "0.21.5" From 0b9f06c78ca549ee3a3aed91b58de740ed90f12e Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 1 Aug 2024 16:55:02 -0500 Subject: [PATCH 59/73] MINOR: [R] switch on `R_VERSION` (#43504) A follow on to #43243 `#ifdef function declaration` does not work, instead we must use the `#if (R_VERSION >= R_Version(4, 2, 0))` pattern to only include code for specific versions. Lead-authored-by: Jonathan Keane Co-authored-by: Neal Richardson Signed-off-by: Jonathan Keane --- r/src/arrow_cpp11.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index 073b577d63ade..c4483ce531f43 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -389,7 +389,7 @@ SEXP to_r6(const std::shared_ptr& ptr, const char* r6_class_name) { // R_existsVarInFrame doesn't exist before R 4.2, so we need to fall back to // Rf_findVarInFrame3 if it is not defined. 
-#ifdef R_existsVarInFrame +#if R_VERSION >= R_Version(4, 2, 0) if (!R_existsVarInFrame(arrow::r::ns::arrow, r6_class)) { cpp11::stop("No arrow R6 class named '%s'", r6_class_name); } From b35e7ddbefe394ab7e5a6b1c3f3bb476cdc800f2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Aug 2024 02:10:00 +0200 Subject: [PATCH 60/73] GH-43507: [C++] Use ViewOrCopyTo instead of CopyTo when pretty printing non-CPU data (#43508) ### Rationale for this change When ensuring the data we are pretty-printing is on the CPU, we can use `ViewOrCopyTo` instead of `CopyTo`, in case the data can be viewed as CPU data without a copy. ### Are these changes tested? Yes (I added a test that uses CUDA host memory, which should be a case where it can be viewed and doesn't need to be copied, but of course the test is not actually ensuring we avoid the copy, just that the printing works) ### Are there any user-facing changes? No * GitHub Issue: #43507 Authored-by: Joris Van den Bossche Signed-off-by: Sutou Kouhei --- cpp/src/arrow/pretty_print.cc | 3 ++- python/pyarrow/tests/test_cuda.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 53a6953681660..c5905d0c8c5ea 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -394,7 +394,8 @@ class ArrayPrinter : public PrettyPrinter { if (array.device_type() != DeviceAllocationType::kCPU) { // GH-43055: ideally we only copy start/end slices from non-CPU memory // based on the window size that is being printed - ARROW_ASSIGN_OR_RAISE(auto array_cpu, array.CopyTo(default_cpu_memory_manager())); + ARROW_ASSIGN_OR_RAISE(auto array_cpu, + array.ViewOrCopyTo(default_cpu_memory_manager())); RETURN_NOT_OK(VisitArrayInline(*array_cpu, this)); } else { RETURN_NOT_OK(VisitArrayInline(array, this)); diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index 61f784a729f73..8749ab29d0821 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -973,6 +973,17 @@ def test_print_array(): assert str(carr) == str(arr) +@pytest.mark.parametrize("size", [10, 100]) +def test_print_array_host(size): + buf = cuda.new_host_buffer(size*8) + np_arr = np.frombuffer(buf, dtype=np.int64) + np_arr[:] = range(size) + + arr = pa.array(range(size), pa.int64()) + carr = pa.Array.from_buffers(pa.int64(), size, [None, buf]) + assert str(carr) == str(arr) + + def make_chunked_array(n_elements_per_chunk, n_chunks): arrs = [] carrs = [] From fd536176a7b19c73b63bd29acf536fbbb2d8083e Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 2 Aug 2024 09:40:27 +0900 Subject: [PATCH 61/73] GH-43453: [Format] Add Opaque canonical extension type (#43457) ### Rationale for this change Add the newly ratified extension type. ### What changes are included in this PR? The type specification only. ### Are these changes tested? N/A ### Are there any user-facing changes? No. * GitHub Issue: #43453 Lead-authored-by: David Li Co-authored-by: Sutou Kouhei Signed-off-by: David Li --- docs/source/format/CanonicalExtensions.rst | 110 +++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index c258f889dc6ac..1d86fcf23c4f7 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -283,6 +283,116 @@ UUID
This extension represents UUIDs as FixedSizeBinary(16) with big-endian notation and does not interpret the bytes in any way. +Opaque +====== + +Opaque represents a type that an Arrow-based system received from an external +(often non-Arrow) system, but that it cannot interpret. In this case, it can +pass on Opaque to its clients to at least show that a field exists and +preserve metadata about the type from the other system. + +Extension parameters: + +* Extension name: ``arrow.opaque``. + +* The storage type of this extension is any type. If there is no underlying + data, the storage type should be Null. + +* Extension type parameters: + + * **type_name** = the name of the unknown type in the external system. + * **vendor_name** = the name of the external system. + +* Description of the serialization: + + A valid JSON object containing the parameters as fields. In the future, + additional fields may be added, but all fields current and future are never + required to interpret the array. + + Developers **should not** attempt to enable public semantic interoperability + of Opaque by canonicalizing specific values of these parameters. + +Rationale +--------- + +Interfacing with non-Arrow systems requires a way to handle data that doesn't +have an equivalent Arrow type. In this case, use the Opaque type, which +explicitly represents an unsupported field. Other solutions are inadequate: + +* Raising an error means even one unsupported field makes all operations + impossible, even if (for instance) the user is just trying to view a schema. +* Dropping unsupported columns misleads the user as to the actual schema. +* An extension type may not exist for the unsupported type. +* Generating an extension type on the fly would falsely imply support. + +Applications **should not** make conventions around vendor_name and type_name. +These parameters are meant for human end users to understand what type wasn't +supported. Applications may try to interpret these fields, but must be +prepared for breakage (e.g., when the type becomes supported with a custom +extension type later on). Similarly, **Opaque is not a generic container for +file formats**. Considerations such as MIME types are irrelevant. In both of +these cases, create a custom extension type instead. + +Examples: + +* A Flight SQL service that supports connecting external databases may + encounter columns with unsupported types in external tables. In this case, + it can use the Opaque[Null] type to at least report that a column exists + with a particular name and type name. This lets clients know that a column + exists, but is not supported. Null is used as the storage type here because + only schemas are involved. + + An example of the extension metadata would be:: + + {"type_name": "varray", "vendor_name": "Oracle"} + +* The ADBC PostgreSQL driver gets results as a series of length-prefixed byte + fields. But the driver will not always know how to parse the bytes, as + there may be extensions (e.g. PostGIS). It can use Opaque[Binary] to still + return those bytes to the application, which may be able to parse the data + itself. Opaque differentiates the column from an actual binary column and + makes it clear that the value is directly from PostgreSQL. (A custom + extension type is preferred, but there will always be extensions that the + driver does not know about.) 
+ + An example of the extension metadata would be:: + + {"type_name": "geometry", "vendor_name": "PostGIS"} + +* The ADBC PostgreSQL driver may also know how to parse the bytes, but not + know the intended semantics. For example, `composite types + `_ can add new + semantics to existing types, somewhat like Arrow extension types. The + driver would be able to parse the underlying bytes in this case, but would + still use the Opaque type. + + Consider the example in the PostgreSQL documentation of a ``complex`` type. + Mapping the type to a plain Arrow ``struct`` type would lose meaning, just + like how an Arrow system deciding to treat all extension types by dropping + the extension metadata would be undesirable. Instead, the driver can use + Opaque[Struct] to pass on the composite type info. (It would be wrong to + try to map this to an Arrow-defined complex type: it does not know the + proper semantics of a user-defined type, which cannot and should not be + hardcoded into the driver in the first place.) + + An example of the extension metadata would be:: + + {"type_name": "database_name.schema_name.complex", "vendor_name": "PostgreSQL"} + +* The JDBC adapter in the Arrow Java libraries converts JDBC result sets into + Arrow arrays, and can get Arrow schemas from result sets. JDBC, however, + allows drivers to return `arbitrary Java objects + `_. + + The driver can use Opaque[Null] as a placeholder during schema conversion, + only erroring if the application tries to fetch the actual data. That way, + clients can at least introspect result schemas to decide whether it can + proceed to fetch the data, or only query certain columns. + + An example of the extension metadata would be:: + + {"type_name": "OTHER", "vendor_name": "JDBC driver name"} + ========================= Community Extension Types ========================= From f2f3e95605537e8ffea9886d5c4ee9bbf5f949fd Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 2 Aug 2024 09:55:01 +0900 Subject: [PATCH 62/73] GH-43456: [Java] Add Opaque canonical extension type (#43460) ### Rationale for this change Add the newly ratified extension type. ### What changes are included in this PR? The Java implementation only. ### Are these changes tested? Yes ### Are there any user-facing changes? No. 
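As a concrete sketch of how the new fallback can be wired up on the Java side (based on the `reportUnsupportedTypesAsOpaque` helper and the H2 test added below; the allocator setup and the "H2" vendor name are illustrative, not prescribed):

    import java.util.Calendar;
    import java.util.function.Function;
    import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
    import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
    import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
    import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.vector.types.pojo.ArrowType;

    class OpaqueFallbackSketch {
      // Build a config whose type converter reports unmapped JDBC types as
      // Opaque[Null] fields instead of throwing UnsupportedOperationException
      static JdbcToArrowConfig configFor(BufferAllocator allocator) {
        Calendar calendar = Calendar.getInstance();
        Function<JdbcFieldInfo, ArrowType> defaults =
            (field) -> JdbcToArrowUtils.getArrowTypeFromJdbcType(field, calendar);
        return new JdbcToArrowConfigBuilder()
            .setAllocator(allocator)
            .setJdbcToArrowTypeConverter(
                JdbcToArrowUtils.reportUnsupportedTypesAsOpaque(defaults, "H2"))
            .build();
      }
    }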
* GitHub Issue: #43456 Authored-by: David Li Signed-off-by: David Li --- .../jdbc/JdbcToArrowConfigBuilder.java | 2 + .../arrow/adapter/jdbc/JdbcToArrowUtils.java | 22 +- .../jdbc/h2/JdbcToArrowDataTypesTest.java | 51 +++ java/vector/src/main/java/module-info.java | 1 + .../InvalidExtensionMetadataException.java | 28 ++ .../arrow/vector/extension/OpaqueType.java | 393 ++++++++++++++++++ .../arrow/vector/extension/OpaqueVector.java | 58 +++ .../arrow/vector/TestOpaqueExtensionType.java | 188 +++++++++ 8 files changed, 742 insertions(+), 1 deletion(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/extension/InvalidExtensionMetadataException.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestOpaqueExtensionType.java diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java index 783a373c6d0a7..ea9ffe55d334a 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java @@ -211,6 +211,8 @@ public JdbcToArrowConfigBuilder setTargetBatchSize(int targetBatchSize) { * *

Defaults to wrapping {@link JdbcToArrowUtils#getArrowTypeFromJdbcType(JdbcFieldInfo, * Calendar)}. + * + * @see JdbcToArrowUtils#reportUnsupportedTypesAsOpaque(Function) */ public JdbcToArrowConfigBuilder setJdbcToArrowTypeConverter( Function jdbcToArrowTypeConverter) { diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java index 8397d4c9e0dc4..aecb734a8bbf7 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java @@ -18,6 +18,7 @@ import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; +import static org.apache.arrow.vector.types.Types.MinorType; import java.io.IOException; import java.math.RoundingMode; @@ -37,6 +38,7 @@ import java.util.Locale; import java.util.Map; import java.util.TimeZone; +import java.util.function.Function; import org.apache.arrow.adapter.jdbc.consumer.ArrayConsumer; import org.apache.arrow.adapter.jdbc.consumer.BigIntConsumer; import org.apache.arrow.adapter.jdbc.consumer.BinaryConsumer; @@ -80,6 +82,7 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.extension.OpaqueType; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -216,11 +219,28 @@ public static ArrowType getArrowTypeFromJdbcType( case Types.STRUCT: return new ArrowType.Struct(); default: - // no-op, shouldn't get here throw new UnsupportedOperationException("Unmapped JDBC type: " + fieldInfo.getJdbcType()); } } + /** + * Wrap a JDBC to Arrow type converter such that {@link UnsupportedOperationException} becomes + * {@link OpaqueType}. + * + * @param typeConverter The type converter to wrap. + * @param vendorName The database name to report as the Opaque type's vendor name. + */ + public static Function reportUnsupportedTypesAsOpaque( + Function typeConverter, String vendorName) { + return (final JdbcFieldInfo fieldInfo) -> { + try { + return typeConverter.apply(fieldInfo); + } catch (UnsupportedOperationException e) { + return new OpaqueType(MinorType.NULL.getType(), fieldInfo.getTypeName(), vendorName); + } + }; + } + /** * Create Arrow {@link Schema} object for the given JDBC {@link java.sql.ResultSetMetaData}.
* diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java index 5537e1acba2bc..c246bb2bec47e 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -32,19 +32,27 @@ import static org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper.assertTinyIntVectorValues; import static org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper.assertVarBinaryVectorValues; import static org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper.assertVarcharVectorValues; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; +import java.sql.DriverManager; +import java.sql.ResultSet; import java.sql.ResultSetMetaData; import java.sql.SQLException; +import java.sql.Statement; import java.util.Arrays; import java.util.Calendar; +import java.util.function.Function; import java.util.stream.Stream; import org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest; +import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; import org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper; import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; import org.apache.arrow.adapter.jdbc.Table; +import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; @@ -62,7 +70,12 @@ import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.extension.OpaqueType; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -189,6 +202,44 @@ public void testJdbcSchemaMetadata(Table table) throws SQLException, ClassNotFou JdbcToArrowTestHelper.assertFieldMetadataMatchesResultSetMetadata(rsmd, schema); } + @Test + void testOpaqueType() throws SQLException, ClassNotFoundException { + try (BufferAllocator allocator = new RootAllocator()) { + String url = "jdbc:h2:mem:JdbcToArrowTest"; + String driver = "org.h2.Driver"; + Class.forName(driver); + conn = DriverManager.getConnection(url); + try (Statement stmt = conn.createStatement()) { + stmt.executeUpdate("CREATE TABLE unknowntype (a GEOMETRY, b INT)"); + } + + String query = "SELECT * FROM unknowntype"; + Calendar calendar = Calendar.getInstance(); + Function typeConverter = + (field) -> JdbcToArrowUtils.getArrowTypeFromJdbcType(field, calendar); + JdbcToArrowConfig config = + new JdbcToArrowConfigBuilder() + .setAllocator(allocator) + .setJdbcToArrowTypeConverter( + JdbcToArrowUtils.reportUnsupportedTypesAsOpaque(typeConverter, "H2")) + .build(); + Schema schema; + try (Statement stmt = conn.createStatement(); + ResultSet rs = stmt.executeQuery(query)) { + schema = + assertDoesNotThrow(() -> 
JdbcToArrowUtils.jdbcToArrowSchema(rs.getMetaData(), config)); + } + + Schema expected = + new Schema( + Arrays.asList( + Field.nullable( + "A", new OpaqueType(Types.MinorType.NULL.getType(), "GEOMETRY", "H2")), + Field.nullable("B", Types.MinorType.INT.getType()))); + assertEquals(expected, schema); + } + } + /** * This method calls the assert methods for various DataSets. * diff --git a/java/vector/src/main/java/module-info.java b/java/vector/src/main/java/module-info.java index fdea2bd06726e..8ba1b3579e0e1 100644 --- a/java/vector/src/main/java/module-info.java +++ b/java/vector/src/main/java/module-info.java @@ -25,6 +25,7 @@ exports org.apache.arrow.vector.complex.writer; exports org.apache.arrow.vector.compression; exports org.apache.arrow.vector.dictionary; + exports org.apache.arrow.vector.extension; exports org.apache.arrow.vector.holders; exports org.apache.arrow.vector.ipc; exports org.apache.arrow.vector.ipc.message; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/extension/InvalidExtensionMetadataException.java b/java/vector/src/main/java/org/apache/arrow/vector/extension/InvalidExtensionMetadataException.java new file mode 100644 index 0000000000000..2349a7d4bc28d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/extension/InvalidExtensionMetadataException.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.extension; + +/** The extension metadata was malformed. */ +public class InvalidExtensionMetadataException extends RuntimeException { + public InvalidExtensionMetadataException(String message) { + super(message); + } + + public InvalidExtensionMetadataException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java b/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java new file mode 100644 index 0000000000000..a0e898a543ff6 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java @@ -0,0 +1,393 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.extension; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.util.Collections; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarBinaryVector; +import org.apache.arrow.vector.ViewVarCharVector; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ExtensionTypeRegistry; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** + * Opaque is a placeholder for a type from an external (usually non-Arrow) system that could not be + * interpreted. + */ +public class OpaqueType extends ArrowType.ExtensionType { + private static final AtomicBoolean registered = new AtomicBoolean(false); + public static final String EXTENSION_NAME = "arrow.opaque"; + private final ArrowType storageType; + private final String typeName; + private final String vendorName; + + /** Register the extension type so it can be used globally. 
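Registration is idempotent: an atomic flag guarantees the type is added to the {@link ExtensionTypeRegistry} at most once, even under concurrent calls.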
*/ + public static void ensureRegistered() { + if (!registered.getAndSet(true)) { + // The values don't matter, we just need an instance + ExtensionTypeRegistry.register(new OpaqueType(Types.MinorType.NULL.getType(), "", "")); + } + } + + /** + * Create a new type instance. + * + * @param storageType The underlying Arrow type. + * @param typeName The name of the unknown type. + * @param vendorName The name of the originating system of the unknown type. + */ + public OpaqueType(ArrowType storageType, String typeName, String vendorName) { + this.storageType = Objects.requireNonNull(storageType, "storageType"); + this.typeName = Objects.requireNonNull(typeName, "typeName"); + this.vendorName = Objects.requireNonNull(vendorName, "vendorName"); + } + + @Override + public ArrowType storageType() { + return storageType; + } + + public String typeName() { + return typeName; + } + + public String vendorName() { + return vendorName; + } + + @Override + public String extensionName() { + return EXTENSION_NAME; + } + + @Override + public boolean extensionEquals(ExtensionType other) { + return other != null + && EXTENSION_NAME.equals(other.extensionName()) + && other instanceof OpaqueType + && storageType.equals(other.storageType()) + && typeName.equals(((OpaqueType) other).typeName()) + && vendorName.equals(((OpaqueType) other).vendorName()); + } + + @Override + public String serialize() { + ObjectMapper mapper = new ObjectMapper(); + ObjectNode object = mapper.createObjectNode(); + object.put("type_name", typeName); + object.put("vendor_name", vendorName); + try { + return mapper.writeValueAsString(object); + } catch (JsonProcessingException e) { + throw new RuntimeException("Could not serialize " + this, e); + } + } + + @Override + public ArrowType deserialize(ArrowType storageType, String serializedData) { + ObjectMapper mapper = new ObjectMapper(); + JsonNode object; + try { + object = mapper.readTree(serializedData); + } catch (JsonProcessingException e) { + throw new InvalidExtensionMetadataException("Extension metadata is invalid", e); + } + JsonNode typeName = object.get("type_name"); + JsonNode vendorName = object.get("vendor_name"); + if (typeName == null) { + throw new InvalidExtensionMetadataException("typeName is missing"); + } + if (vendorName == null) { + throw new InvalidExtensionMetadataException("vendorName is missing"); + } + if (!typeName.isTextual()) { + throw new InvalidExtensionMetadataException("typeName should be string, was " + typeName); + } + if (!vendorName.isTextual()) { + throw new InvalidExtensionMetadataException("vendorName should be string, was " + vendorName); + } + return new OpaqueType(storageType, typeName.asText(), vendorName.asText()); + } + + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator) { + // XXX: fieldType is supposed to be the extension type + final Field field = new Field(name, fieldType, Collections.emptyList()); + final FieldVector underlyingVector = + storageType.accept(new UnderlyingVectorTypeVisitor(name, allocator)); + return new OpaqueVector(field, allocator, underlyingVector); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), storageType, typeName, vendorName); + } + + @Override + public String toString() { + return "OpaqueType(" + + storageType + + ", typeName='" + + typeName + + '\'' + + ", vendorName='" + + vendorName + + '\'' + + ')'; + } + + private static class UnderlyingVectorTypeVisitor implements ArrowTypeVisitor { + private final String name; + 
private final BufferAllocator allocator; + + UnderlyingVectorTypeVisitor(String name, BufferAllocator allocator) { + this.name = name; + this.allocator = allocator; + } + + @Override + public FieldVector visit(Null type) { + return new NullVector(name); + } + + private RuntimeException unsupported(ArrowType type) { + throw new UnsupportedOperationException( + "OpaqueType#getUnderlyingVector is not supported for storage type: " + type); + } + + @Override + public FieldVector visit(Struct type) { + throw unsupported(type); + } + + @Override + public FieldVector visit(List type) { + throw unsupported(type); + } + + @Override + public FieldVector visit(LargeList type) { + throw unsupported(type); + } + + @Override + public FieldVector visit(FixedSizeList type) { + throw unsupported(type); + } + + @Override + public FieldVector visit(Union type) { + throw unsupported(type); + } + + @Override + public FieldVector visit(Map type) { + throw unsupported(type); + } + + @Override + public FieldVector visit(Int type) { + return new IntVector(name, allocator); + } + + @Override + public FieldVector visit(FloatingPoint type) { + switch (type.getPrecision()) { + case HALF: + return new Float2Vector(name, allocator); + case SINGLE: + return new Float4Vector(name, allocator); + case DOUBLE: + return new Float8Vector(name, allocator); + default: + throw unsupported(type); + } + } + + @Override + public FieldVector visit(Utf8 type) { + return new VarCharVector(name, allocator); + } + + @Override + public FieldVector visit(Utf8View type) { + return new ViewVarCharVector(name, allocator); + } + + @Override + public FieldVector visit(LargeUtf8 type) { + return new LargeVarCharVector(name, allocator); + } + + @Override + public FieldVector visit(Binary type) { + return new VarBinaryVector(name, allocator); + } + + @Override + public FieldVector visit(BinaryView type) { + return new ViewVarBinaryVector(name, allocator); + } + + @Override + public FieldVector visit(LargeBinary type) { + return new LargeVarBinaryVector(name, allocator); + } + + @Override + public FieldVector visit(FixedSizeBinary type) { + return new FixedSizeBinaryVector(Field.nullable(name, type), allocator); + } + + @Override + public FieldVector visit(Bool type) { + return new BitVector(name, allocator); + } + + @Override + public FieldVector visit(Decimal type) { + if (type.getBitWidth() == 128) { + return new DecimalVector(Field.nullable(name, type), allocator); + } else if (type.getBitWidth() == 256) { + return new Decimal256Vector(Field.nullable(name, type), allocator); + } + throw unsupported(type); + } + + @Override + public FieldVector visit(Date type) { + switch (type.getUnit()) { + case DAY: + return new DateDayVector(name, allocator); + case MILLISECOND: + return new DateMilliVector(name, allocator); + default: + throw unsupported(type); + } + } + + @Override + public FieldVector visit(Time type) { + switch (type.getUnit()) { + case SECOND: + return new TimeSecVector(name, allocator); + case MILLISECOND: + return new TimeMilliVector(name, allocator); + case MICROSECOND: + return new TimeMicroVector(name, allocator); + case NANOSECOND: + return new TimeNanoVector(name, allocator); + default: + throw unsupported(type); + } + } + + @Override + public FieldVector visit(Timestamp type) { + if (type.getTimezone() == null || type.getTimezone().isEmpty()) { + switch (type.getUnit()) { + case SECOND: + return new TimeStampSecVector(Field.nullable(name, type), allocator); + case MILLISECOND: + return new 
TimeStampMilliVector(Field.nullable(name, type), allocator); + case MICROSECOND: + return new TimeStampMicroVector(Field.nullable(name, type), allocator); + case NANOSECOND: + return new TimeStampNanoVector(Field.nullable(name, type), allocator); + default: + throw unsupported(type); + } + } + switch (type.getUnit()) { + case SECOND: + return new TimeStampSecTZVector(Field.nullable(name, type), allocator); + case MILLISECOND: + return new TimeStampMilliTZVector(Field.nullable(name, type), allocator); + case MICROSECOND: + return new TimeStampMicroTZVector(Field.nullable(name, type), allocator); + case NANOSECOND: + return new TimeStampNanoTZVector(Field.nullable(name, type), allocator); + default: + throw unsupported(type); + } + } + + @Override + public FieldVector visit(Interval type) { + switch (type.getUnit()) { + case YEAR_MONTH: + return new IntervalYearVector(name, allocator); + case DAY_TIME: + return new IntervalDayVector(name, allocator); + case MONTH_DAY_NANO: + return new IntervalMonthDayNanoVector(name, allocator); + default: + throw unsupported(type); + } + } + + @Override + public FieldVector visit(Duration type) { + return new DurationVector(Field.nullable(name, type), allocator); + } + + @Override + public FieldVector visit(ListView type) { + throw unsupported(type); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueVector.java new file mode 100644 index 0000000000000..00eb9a984e6bf --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueVector.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.extension; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueIterableVector; +import org.apache.arrow.vector.types.pojo.Field; + +/** + * Opaque is a wrapper for (usually binary) data from an external (often non-Arrow) system that + * could not be interpreted. 
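+ * Reads are delegated to the underlying storage vector: {@code getObject} and the per-index
+ * {@code hashCode} simply forward to it.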
+ */
+public class OpaqueVector extends ExtensionTypeVector<FieldVector>
+    implements ValueIterableVector<Object> {
+  private final Field field;
+
+  public OpaqueVector(Field field, BufferAllocator allocator, FieldVector underlyingVector) {
+    super(field, allocator, underlyingVector);
+    this.field = field;
+  }
+
+  @Override
+  public Field getField() {
+    return field;
+  }
+
+  @Override
+  public Object getObject(int index) {
+    return getUnderlyingVector().getObject(index);
+  }
+
+  @Override
+  public int hashCode(int index) {
+    return hashCode(index, null);
+  }
+
+  @Override
+  public int hashCode(int index, ArrowBufHasher hasher) {
+    return getUnderlyingVector().hashCode(index, hasher);
+  }
+}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestOpaqueExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/TestOpaqueExtensionType.java
new file mode 100644
index 0000000000000..9fd9b580b361f
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestOpaqueExtensionType.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.arrow.vector; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.stream.Stream; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.extension.InvalidExtensionMetadataException; +import org.apache.arrow.vector.extension.OpaqueType; +import org.apache.arrow.vector.extension.OpaqueVector; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +class TestOpaqueExtensionType { + BufferAllocator allocator; + + @BeforeEach + void beforeEach() { + allocator = new RootAllocator(); + } + + @AfterEach + void afterEach() { + allocator.close(); + } + + @ParameterizedTest + @ValueSource( + strings = { + "{\"type_name\": \"\", \"vendor_name\": \"\"}", + "{\"type_name\": \"\", \"vendor_name\": \"\", \"extra_field\": 42}", + "{\"type_name\": \"array\", \"vendor_name\": \"postgresql\"}", + "{\"type_name\": \"foo.bar\", \"vendor_name\": \"postgresql\"}", + }) + void testDeserializeValid(String serialized) { + ArrowType storageType = Types.MinorType.NULL.getType(); + OpaqueType type = new OpaqueType(storageType, "", ""); + + assertDoesNotThrow(() -> type.deserialize(storageType, serialized)); + } + + @ParameterizedTest + @ValueSource( + strings = { + "", + "{\"type_name\": \"\"}", + "{\"vendor_name\": \"\"}", + "{\"type_name\": null, \"vendor_name\": \"\"}", + "{\"type_name\": \"\", \"vendor_name\": null}", + "{\"type_name\": 42, \"vendor_name\": \"\"}", + "{\"type_name\": \"\", \"vendor_name\": 42}", + "{\"type_name\": \"\", \"vendor_name\": \"\"", + }) + void testDeserializeInvalid(String serialized) { + ArrowType storageType = Types.MinorType.NULL.getType(); + OpaqueType type = new OpaqueType(storageType, "", ""); + + assertThrows( + InvalidExtensionMetadataException.class, () -> type.deserialize(storageType, serialized)); + } + + @ParameterizedTest + @MethodSource("storageType") + void testRoundTrip(ArrowType storageType) { + OpaqueType type = new OpaqueType(storageType, "foo", "bar"); + assertEquals(storageType, type.storageType()); + assertEquals("foo", type.typeName()); + if (storageType.isComplex()) { + assertThrows( + UnsupportedOperationException.class, + () -> 
type.getNewVector("name", FieldType.nullable(type), allocator));
+    } else {
+      assertDoesNotThrow(() -> type.getNewVector("name", FieldType.nullable(type), allocator))
+          .close();
+    }
+
+    String serialized = assertDoesNotThrow(type::serialize);
+    OpaqueType holder = new OpaqueType(Types.MinorType.NULL.getType(), "", "");
+    OpaqueType deserialized = (OpaqueType) holder.deserialize(storageType, serialized);
+    assertEquals(type, deserialized);
+    assertNotEquals(holder, deserialized);
+  }
+
+  @ParameterizedTest
+  @MethodSource("storageType")
+  void testIpcRoundTrip(ArrowType storageType) {
+    OpaqueType.ensureRegistered();
+
+    OpaqueType type = new OpaqueType(storageType, "foo", "bar");
+    Schema schema = new Schema(Collections.singletonList(Field.nullable("unknown", type)));
+    byte[] serialized = schema.serializeAsMessage();
+    Schema deserialized = Schema.deserializeMessage(ByteBuffer.wrap(serialized));
+    assertEquals(schema, deserialized);
+  }
+
+  @Test
+  void testVectorType() throws IOException {
+    OpaqueType.ensureRegistered();
+
+    ArrowType storageType = Types.MinorType.VARBINARY.getType();
+    OpaqueType type = new OpaqueType(storageType, "foo", "bar");
+    try (FieldVector vector = type.getNewVector("field", FieldType.nullable(type), allocator)) {
+      OpaqueVector opaque = assertInstanceOf(OpaqueVector.class, vector);
+      assertEquals("field", opaque.getField().getName());
+      assertEquals(type, opaque.getField().getType());
+
+      VarBinaryVector binary =
+          assertInstanceOf(VarBinaryVector.class, opaque.getUnderlyingVector());
+      binary.setSafe(0, new byte[] {0, 1, 2, 3});
+      binary.setNull(1);
+      opaque.setValueCount(2);
+
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      try (VectorSchemaRoot root = new VectorSchemaRoot(Collections.singletonList(opaque));
+          ArrowStreamWriter writer =
+              new ArrowStreamWriter(root, new DictionaryProvider.MapDictionaryProvider(), baos)) {
+        writer.start();
+        writer.writeBatch();
+      }
+
+      try (ArrowStreamReader reader =
+          new ArrowStreamReader(new ByteArrayInputStream(baos.toByteArray()), allocator)) {
+        assertTrue(reader.loadNextBatch());
+        VectorSchemaRoot root = reader.getVectorSchemaRoot();
+        assertEquals(2, root.getRowCount());
+        assertEquals(new Schema(Collections.singletonList(opaque.getField())), root.getSchema());
+
+        OpaqueVector actual = assertInstanceOf(OpaqueVector.class, root.getVector("field"));
+        assertFalse(actual.isNull(0));
+        assertTrue(actual.isNull(1));
+        assertArrayEquals(new byte[] {0, 1, 2, 3}, (byte[]) actual.getObject(0));
+        assertNull(actual.getObject(1));
+      }
+    }
+  }
+
+  static Stream<ArrowType> storageType() {
+    return Stream.of(
+        Types.MinorType.NULL.getType(),
+        Types.MinorType.BIGINT.getType(),
+        Types.MinorType.BIT.getType(),
+        Types.MinorType.VARBINARY.getType(),
+        Types.MinorType.VARCHAR.getType(),
+        Types.MinorType.LIST.getType(),
+        new ArrowType.Decimal(12, 4, 128));
+  }
+}
From 8068554a5d18129121f824978c7ee4a9c430640b Mon Sep 17 00:00:00 2001
From: Vibhatha Lakmal Abeykoon
Date: Fri, 2 Aug 2024 06:34:34 +0530
Subject: [PATCH 63/73] GH-43512: [Java] ListViewVector Visitor-based component Integration (#43513)

### Rationale for this change This PR integrates the core visitor-based components into ListViewVector. ### What changes are included in this PR? This PR adds `RangeEqualsVisitor` and `TypeEqualsVisitor` support, together with the necessary test cases. ### Are these changes tested? Yes ### Are there any user-facing changes?
No * GitHub Issue: #43512 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../vector/compare/RangeEqualsVisitor.java | 57 ++++++++++ .../vector/compare/TypeEqualsVisitor.java | 6 ++ .../arrow/vector/compare/VectorVisitor.java | 5 + .../arrow/vector/complex/ListViewVector.java | 2 +- .../apache/arrow/vector/TestValueVector.java | 60 +++++++++++ .../compare/TestRangeEqualsVisitor.java | 100 ++++++++++++++++++ .../vector/compare/TestTypeEqualsVisitor.java | 17 +++ 7 files changed, 246 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 3050649737355..fbc28a3609c07 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -32,10 +32,12 @@ import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -234,6 +236,14 @@ public Boolean visit(ExtensionTypeVector left, Range range) { return underlyingVisitor.rangeEquals(range); } + @Override + public Boolean visit(ListViewVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareListViewVectors(range); + } + protected RangeEqualsVisitor createInnerVisitor( ValueVector leftInner, ValueVector rightInner, @@ -702,4 +712,51 @@ protected boolean compareLargeListVectors(Range range) { } return true; } + + protected boolean compareListViewVectors(Range range) { + ListViewVector leftVector = (ListViewVector) left; + ListViewVector rightVector = (ListViewVector) right; + + RangeEqualsVisitor innerVisitor = + createInnerVisitor( + leftVector.getDataVector(), rightVector.getDataVector(), /*type comparator*/ null); + Range innerRange = new Range(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + int offsetWidth = BaseRepeatedValueViewVector.OFFSET_WIDTH; + int sizeWidth = BaseRepeatedValueViewVector.SIZE_WIDTH; + + if (!isNull) { + final int startIndexLeft = + leftVector.getOffsetBuffer().getInt((long) leftIndex * offsetWidth); + final int leftSize = leftVector.getSizeBuffer().getInt((long) leftIndex * sizeWidth); + + final int startIndexRight = + rightVector.getOffsetBuffer().getInt((long) rightIndex * offsetWidth); + final int rightSize = rightVector.getSizeBuffer().getInt((long) rightIndex * sizeWidth); + + if (leftSize != rightSize) { + return false; + } + + innerRange = + innerRange + .setRightStart(startIndexRight) + .setLeftStart(startIndexLeft) + .setLength(leftSize); + if (!innerVisitor.rangeEquals(innerRange)) { + return false; + } + } + } + return true; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java 
b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 15cc2c31b8b98..6e15d6a83e7d9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.pojo.Field; @@ -124,6 +125,11 @@ public Boolean visit(ExtensionTypeVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(ListViewVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + private boolean compareField(Field leftField, Field rightField) { if (leftField == rightField) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index 870f015862764..c912359d4af5d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -26,6 +26,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -60,4 +61,8 @@ public interface VectorVisitor { OUT visit(NullVector left, IN value); OUT visit(ExtensionTypeVector left, IN value); + + default OUT visit(ListViewVector left, IN value) { + throw new UnsupportedOperationException("VectorVisitor for ListViewVector is not supported."); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java index d719c9b1a9a4e..6ced66d81ec21 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java @@ -341,7 +341,7 @@ public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { @Override public OUT accept(VectorVisitor visitor, IN value) { - throw new UnsupportedOperationException("ListViewVector does not support visitor pattern."); + return visitor.visit(this, value); } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 376ad3ec7504f..4dd55afdb8b04 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -2882,6 +2882,34 @@ public void testListVectorEqualsWithNull() { } } + @Test + public void testListViewVectorEqualsWithNull() { + try (final ListViewVector vector1 = ListViewVector.empty("listview", allocator); + final ListViewVector vector2 = ListViewVector.empty("listview", allocator); ) { + + UnionListViewWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + // set some values + 
writeListViewVector(writer1, new int[] {1, 2}); + writeListViewVector(writer1, new int[] {3, 4}); + writeListViewVector(writer1, new int[] {}); + writer1.setValueCount(3); + + UnionListViewWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + // set some values + writeListViewVector(writer2, new int[] {1, 2}); + writeListViewVector(writer2, new int[] {3, 4}); + writer2.setValueCount(3); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + + assertFalse(visitor.vectorEquals(vector1, vector2)); + } + } + @Test public void testListVectorEquals() { try (final ListVector vector1 = ListVector.empty("list", allocator); @@ -2914,6 +2942,38 @@ public void testListVectorEquals() { } } + @Test + public void testListViewVectorEquals() { + try (final ListViewVector vector1 = ListViewVector.empty("listview", allocator); + final ListViewVector vector2 = ListViewVector.empty("listview", allocator); ) { + + UnionListViewWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + // set some values + writeListViewVector(writer1, new int[] {1, 2}); + writeListViewVector(writer1, new int[] {3, 4}); + writeListViewVector(writer1, new int[] {5, 6}); + writer1.setValueCount(3); + + UnionListViewWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + // set some values + writeListViewVector(writer2, new int[] {1, 2}); + writeListViewVector(writer2, new int[] {3, 4}); + writer2.setValueCount(2); + + VectorEqualsVisitor visitor = new VectorEqualsVisitor(); + assertFalse(visitor.vectorEquals(vector1, vector2)); + + writeListViewVector(writer2, new int[] {5, 6}); + writer2.setValueCount(3); + + assertTrue(visitor.vectorEquals(vector1, vector2)); + } + } + @Test public void testListVectorSetNull() { try (final ListVector vector = ListVector.empty("list", allocator)) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index bab8c737f6a7d..7e91b76043057 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -37,10 +37,12 @@ import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.holders.NullableBigIntHolder; import org.apache.arrow.vector.holders.NullableFloat4Holder; @@ -201,6 +203,24 @@ public void testListVectorWithDifferentChild() { } } + @Test + public void testListViewVectorWithDifferentChild() { + try (final ListViewVector vector1 = ListViewVector.empty("listview", allocator); + final ListViewVector vector2 = ListViewVector.empty("listview", allocator); ) { + + vector1.allocateNew(); + vector1.initializeChildrenFromFields( + Arrays.asList(Field.nullable("child", new ArrowType.Int(32, true)))); + + vector2.allocateNew(); + vector2.initializeChildrenFromFields( + Arrays.asList(Field.nullable("child", new ArrowType.Int(64, true)))); + + RangeEqualsVisitor 
visitor = new RangeEqualsVisitor(vector1, vector2); + assertFalse(visitor.rangeEquals(new Range(0, 0, 0))); + } + } + @Test public void testListVectorRangeEquals() { try (final ListVector vector1 = ListVector.empty("list", allocator); @@ -233,6 +253,38 @@ public void testListVectorRangeEquals() { } } + @Test + public void testListViewVectorRangeEquals() { + try (final ListViewVector vector1 = ListViewVector.empty("listview", allocator); + final ListViewVector vector2 = ListViewVector.empty("listview", allocator); ) { + + UnionListViewWriter writer1 = vector1.getWriter(); + writer1.allocate(); + + // set some values + writeListViewVector(writer1, new int[] {1, 2}); + writeListViewVector(writer1, new int[] {3, 4}); + writeListViewVector(writer1, new int[] {5, 6}); + writeListViewVector(writer1, new int[] {7, 8}); + writeListViewVector(writer1, new int[] {9, 10}); + writer1.setValueCount(5); + + UnionListViewWriter writer2 = vector2.getWriter(); + writer2.allocate(); + + // set some values + writeListViewVector(writer2, new int[] {0, 0}); + writeListViewVector(writer2, new int[] {3, 4}); + writeListViewVector(writer2, new int[] {5, 6}); + writeListViewVector(writer2, new int[] {7, 8}); + writeListViewVector(writer2, new int[] {0, 0}); + writer2.setValueCount(5); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + } + } + @Test public void testBitVectorRangeEquals() { try (final BitVector vector1 = new BitVector("v1", allocator); @@ -819,6 +871,38 @@ public void testListVectorApproxEquals() { } } + @Test + public void testListViewVectorApproxEquals() { + try (final ListViewVector right = ListViewVector.empty("listview", allocator); + final ListViewVector left1 = ListViewVector.empty("listview", allocator); + final ListViewVector left2 = ListViewVector.empty("listview", allocator); ) { + + final float epsilon = 1.0E-6f; + + UnionListViewWriter rightWriter = right.getWriter(); + rightWriter.allocate(); + writeListViewVector(rightWriter, new double[] {1, 2}); + writeListViewVector(rightWriter, new double[] {1.01, 2.02}); + rightWriter.setValueCount(2); + + UnionListViewWriter leftWriter1 = left1.getWriter(); + leftWriter1.allocate(); + writeListViewVector(leftWriter1, new double[] {1, 2}); + writeListViewVector(leftWriter1, new double[] {1.01 + epsilon / 2, 2.02 - epsilon / 2}); + leftWriter1.setValueCount(2); + + UnionListViewWriter leftWriter2 = left2.getWriter(); + leftWriter2.allocate(); + writeListViewVector(leftWriter2, new double[] {1, 2}); + writeListViewVector(leftWriter2, new double[] {1.01 + epsilon * 2, 2.02 - epsilon * 2}); + leftWriter2.setValueCount(2); + + Range range = new Range(0, 0, right.getValueCount()); + assertTrue(new ApproxEqualsVisitor(left1, right, epsilon, epsilon).rangeEquals(range)); + assertFalse(new ApproxEqualsVisitor(left2, right, epsilon, epsilon).rangeEquals(range)); + } + } + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { writer.start(); writer.integer("f0").writeInt(value1); @@ -841,6 +925,14 @@ private void writeListVector(UnionListWriter writer, int[] values) { writer.endList(); } + private void writeListViewVector(UnionListViewWriter writer, int[] values) { + writer.startListView(); + for (int v : values) { + writer.integer().writeInt(v); + } + writer.endListView(); + } + private void writeFixedSizeListVector(UnionFixedSizeListWriter writer, int[] values) { writer.startList(); for (int v : values) { @@ -856,4 +948,12 @@ private void 
writeListVector(UnionListWriter writer, double[] values) { } writer.endList(); } + + private void writeListViewVector(UnionListViewWriter writer, double[] values) { + writer.startListView(); + for (double v : values) { + writer.float8().writeFloat8(v); + } + writer.endListView(); + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java index 4cd3603e2071f..d65096205fd71 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java @@ -33,6 +33,7 @@ import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.Types; @@ -104,6 +105,22 @@ public void testListTypeEquals() { } } + @Test + public void testListViewTypeEquals() { + try (final ListViewVector right = ListViewVector.empty("listview", allocator); + final ListViewVector left1 = ListViewVector.empty("listview", allocator); + final ListViewVector left2 = ListViewVector.empty("listview", allocator)) { + + right.addOrGetVector(FieldType.nullable(new ArrowType.Utf8())); + left1.addOrGetVector(FieldType.nullable(new ArrowType.Utf8())); + left2.addOrGetVector(FieldType.nullable(new ArrowType.FixedSizeBinary(2))); + + TypeEqualsVisitor visitor = new TypeEqualsVisitor(right); + assertTrue(visitor.equals(left1)); + assertFalse(visitor.equals(left2)); + } + } + @Test public void testStructTypeEquals() { try (final StructVector right = StructVector.empty("struct", allocator); From 45b176716cc667384577a2a1218c6da454854109 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Sat, 3 Aug 2024 09:23:06 +0800 Subject: [PATCH 64/73] GH-43414: [C++][Compute] Fix invalid memory access when resizing var-length buffer in row table (#43415) ### Rationale for this change As #43414 explained. The UT in PR reproduces this issue well (may need ASAN to detect). ### What changes are included in this PR? Check if `is_fixed_length` before treating the second buffer as offset. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. 
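To make the invariant behind this fix concrete, here is a minimal, self-contained sketch; `RowTableSketch` and its members are hypothetical stand-ins for illustration, not the actual `RowTableImpl` API:

```cpp
// Sketch of the fixed invariant: the second buffer of a row table holds row
// *offsets* only for varying-length rows; for fixed-length rows it holds the
// row data itself, so interpreting it as offsets reads invalid memory.
#include <cstdint>
#include <iostream>
#include <vector>

struct RowTableSketch {
  bool is_fixed_length;           // true: second buffer holds row data, not offsets
  std::vector<uint32_t> offsets;  // meaningful only when !is_fixed_length
  int64_t num_rows = 0;

  // Guarded accessor mirroring the fix: reading offsets[num_rows] on a
  // fixed-length table would touch memory that does not contain offsets.
  int64_t VaryingLengthBytes() const {
    if (is_fixed_length) return 0;
    return offsets[num_rows];
  }
};

int main() {
  RowTableSketch fixed{/*is_fixed_length=*/true, /*offsets=*/{}, /*num_rows=*/3};
  std::cout << fixed.VaryingLengthBytes() << std::endl;  // prints 0, offsets never read
  return 0;
}
```

The patch applies exactly this guard in `RowTableImpl::AppendEmpty` and documents the precondition on `ResizeOptionalVaryingLengthBuffer` with a `DCHECK` and a `\pre` comment.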
* GitHub Issue: #43414 Lead-authored-by: Ruoxi Sun Co-authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/row/row_internal.cc | 8 +++- cpp/src/arrow/compute/row/row_internal.h | 7 ++++ cpp/src/arrow/compute/row/row_test.cc | 47 ++++++++++++++--------- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 2365ef5632cce..746ed950ffa07 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -293,8 +293,10 @@ Status RowTableImpl::ResizeFixedLengthBuffers(int64_t num_extra_rows) { } Status RowTableImpl::ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes) { + DCHECK(!metadata_.is_fixed_length); + int64_t num_bytes = offsets()[num_rows_]; - if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) { + if (bytes_capacity_ >= num_bytes + num_extra_bytes) { return Status::OK(); } @@ -397,7 +399,9 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, Status RowTableImpl::AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append) { RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append)); - RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); + if (!metadata_.is_fixed_length) { + RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); + } num_rows_ += num_rows_to_append; if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) { memset(rows_->mutable_data(), 0, bytes_capacity_); diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 80409f93d2b96..93818fb14d629 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -220,7 +220,14 @@ class ARROW_EXPORT RowTableImpl { } private: + /// \brief Resize the fixed length buffers to store `num_extra_rows` more rows. The + /// fixed length buffers are buffers_[0] for null masks, buffers_[1] for row data if the + /// row is fixed length, or for row offsets otherwise. Status ResizeFixedLengthBuffers(int64_t num_extra_rows); + + /// \brief Resize the optional varying length buffer to store `num_extra_bytes` more + /// bytes. 
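+ /// This is buffers_[2], which exists only for tables whose rows have a varying-length section.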
+  /// \pre !metadata_.is_fixed_length
   Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);

   // Helper functions to determine the number of bytes needed for each
diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc
index 679ad519a9ef2..75f981fb1281d 100644
--- a/cpp/src/arrow/compute/row/row_test.cc
+++ b/cpp/src/arrow/compute/row/row_test.cc
@@ -69,9 +69,14 @@ TEST(RowTableMemoryConsumption, Encode) {
   constexpr int64_t num_rows_max = 8192;
   constexpr int64_t padding_for_vectors = 64;

-  ASSERT_OK_AND_ASSIGN(
-      auto fixed_length_column,
-      ::arrow::gen::Constant(std::make_shared<UInt32Scalar>(0))->Generate(num_rows_max));
+  std::vector<std::shared_ptr<Array>> fixed_length_columns;
+  for (const auto& dt : {int8(), uint16(), int32(), uint64(), fixed_size_binary(16),
+                         fixed_size_binary(32)}) {
+    ASSERT_OK_AND_ASSIGN(auto fixed_length_column,
+                         ::arrow::gen::Random(dt)->Generate(num_rows_max));
+    fixed_length_columns.push_back(std::move(fixed_length_column));
+  }
+
   ASSERT_OK_AND_ASSIGN(auto var_length_column,
                        ::arrow::gen::Constant(std::make_shared<StringScalar>("X"))
                            ->Generate(num_rows_max));
@@ -81,22 +86,26 @@
     {
       SCOPED_TRACE("encoding fixed length column of " + std::to_string(num_rows) + " rows");
-      ASSERT_OK_AND_ASSIGN(auto row_table,
-                           MakeRowTableFromColumn(fixed_length_column, num_rows,
-                                                  uint32()->byte_width(), 0));
-      ASSERT_NE(row_table.data(0), NULLPTR);
-      ASSERT_NE(row_table.data(1), NULLPTR);
-      ASSERT_EQ(row_table.data(2), NULLPTR);
-
-      int64_t actual_null_mask_size =
-          num_rows * row_table.metadata().null_masks_bytes_per_row;
-      ASSERT_LE(actual_null_mask_size, row_table.buffer_size(0) - padding_for_vectors);
-      ASSERT_GT(actual_null_mask_size * 2,
-                row_table.buffer_size(0) - padding_for_vectors);
-
-      int64_t actual_rows_size = num_rows * uint32()->byte_width();
-      ASSERT_LE(actual_rows_size, row_table.buffer_size(1) - padding_for_vectors);
-      ASSERT_GT(actual_rows_size * 2, row_table.buffer_size(1) - padding_for_vectors);
+      for (const auto& col : fixed_length_columns) {
+        const auto& dt = col->type();
+        SCOPED_TRACE("encoding fixed length column of type " + dt->ToString());
+        ASSERT_OK_AND_ASSIGN(auto row_table,
+                             MakeRowTableFromColumn(col, num_rows, dt->byte_width(),
+                                                    /*string_alignment=*/0));
+        ASSERT_NE(row_table.data(0), NULLPTR);
+        ASSERT_NE(row_table.data(1), NULLPTR);
+        ASSERT_EQ(row_table.data(2), NULLPTR);
+
+        int64_t actual_null_mask_size =
+            num_rows * row_table.metadata().null_masks_bytes_per_row;
+        ASSERT_LE(actual_null_mask_size, row_table.buffer_size(0) - padding_for_vectors);
+        ASSERT_GT(actual_null_mask_size * 2,
+                  row_table.buffer_size(0) - padding_for_vectors);
+
+        int64_t actual_rows_size = num_rows * dt->byte_width();
+        ASSERT_LE(actual_rows_size, row_table.buffer_size(1) - padding_for_vectors);
+        ASSERT_GT(actual_rows_size * 2, row_table.buffer_size(1) - padding_for_vectors);
+      }
     }

     // Var length column.
From 1992cc64c74367579fb587f0431bf4340401c38a Mon Sep 17 00:00:00 2001
From: Michael Chirico
Date: Sat, 3 Aug 2024 14:46:18 -0700
Subject: [PATCH 65/73] GH-43548: [R][CI] Use grep -F to simplify matching of rchk output (#43477)

Passing along a minor change we made in our own mimeo of this GHA. With `-F`, grep treats each pattern as a fixed string rather than a regular expression, so bracketed tokens such as `[UP]` and `[PB]` no longer need escaping.
Resolves #43548
* GitHub Issue: #43548
Authored-by: Michael Chirico
Signed-off-by: Jonathan Keane
---
 dev/tasks/r/github.linux.rchk.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/tasks/r/github.linux.rchk.yml b/dev/tasks/r/github.linux.rchk.yml
index a673304ff238d..65c17ae751290 100644
--- a/dev/tasks/r/github.linux.rchk.yml
+++ b/dev/tasks/r/github.linux.rchk.yml
@@ -54,7 +54,7 @@ jobs:
           # ERROR: too many states (abstraction error?))
           # https://github.com/kalibera/rchk
         run: |
-          if [ $(grep -c "Suspicious call" rchk.out) -gt 0 ] || [ $(grep -c "\[UP\]" rchk.out) -gt 0 ] || [ $(grep -c "\[PB\]" rchk.out) -gt 0 ]; then
+          if [ $(grep -Fc "Suspicious call" rchk.out) -gt 0 ] || [ $(grep -Fc "[UP]" rchk.out) -gt 0 ] || [ $(grep -Fc "[PB]" rchk.out) -gt 0 ]; then
            echo "Found rchk errors"
            cat rchk.out
            exit 1
From 39af73f2ada90b2d66c1410f9c591b25544b711f Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Sun, 4 Aug 2024 16:39:21 +0900
Subject: [PATCH 66/73] GH-41909: [C++] Add arrow::ArrayStatistics (#43273)

### Rationale for this change We're discussing the API on the mailing list https://lists.apache.org/thread/kcpyq9npnh346pw90ljwbg0wxq6hwxxh and in GH-41909. If we have `arrow::ArrayStatistics`, we can attach statistics read from Apache Parquet to `arrow::Array`s. This only includes `arrow::ArrayStatistics`. See GH-42133 for how to use `arrow::ArrayStatistics` for Apache Parquet's statistics. ### What changes are included in this PR? This only adds `arrow::ArrayStatistics` and its tests. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #41909
Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 cpp/src/arrow/CMakeLists.txt | 2 +
 cpp/src/arrow/array/statistics.cc | 21 +++
 cpp/src/arrow/array/statistics.h | 76 ++++++++++++++
 cpp/src/arrow/array/statistics_test.cc | 103 +++++++++++++++++++
 4 files changed, 202 insertions(+)
 create mode 100644 cpp/src/arrow/array/statistics.cc
 create mode 100644 cpp/src/arrow/array/statistics.h
 create mode 100644 cpp/src/arrow/array/statistics_test.cc

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 6dc8358f502f5..9c66a58c54261 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -412,6 +412,7 @@ arrow_add_object_library(ARROW_ARRAY
                          array/concatenate.cc
                          array/data.cc
                          array/diff.cc
+                         array/statistics.cc
                          array/util.cc
                          array/validate.cc)

@@ -1168,6 +1169,7 @@ add_arrow_test(array_test
                array/array_struct_test.cc
                array/array_union_test.cc
                array/array_view_test.cc
+               array/statistics_test.cc
                PRECOMPILED_HEADERS
                "$<$<COMPILE_LANGUAGE:CXX>:arrow/testing/pch.h>")

diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc
new file mode 100644
index 0000000000000..b661c9fbaffed
--- /dev/null
+++ b/cpp/src/arrow/array/statistics.cc
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This empty .cc file is for embedding not inlined symbols in +// arrow::ArrayStatistics into libarrow. + +#include "arrow/array/statistics.h" diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h new file mode 100644 index 0000000000000..7357e27f41f5b --- /dev/null +++ b/cpp/src/arrow/array/statistics.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/util/float16.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Statistics for an Array +/// +/// Apache Arrow format doesn't have statistics but data source such +/// as Apache Parquet may have statistics. Statistics associated with +/// data source can be read unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + using ValueType = + std::variant; + + ArrayStatistics() = default; + ~ArrayStatistics() = default; + + /// \brief The number of null values, may not be set + std::optional null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional distinct_count = std::nullopt; + + /// \brief The minimum value, may not be set + std::optional min = std::nullopt; + + /// \brief Whether the minimum value is exact or not, may not be set + std::optional is_min_exact = std::nullopt; + + /// \brief The maximum value, may not be set + std::optional max = std::nullopt; + + /// \brief Whether the maximum value is exact or not, may not be set + std::optional is_max_exact = std::nullopt; + + /// \brief Check two statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min == other.min && is_min_exact == other.is_min_exact && max == other.max && + is_max_exact == other.is_max_exact; + } + + /// \brief Check two statistics for equality + bool operator==(const ArrayStatistics& other) const { return Equals(other); } + + /// \brief Check two statistics for not equality + bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc new file mode 100644 index 0000000000000..a465ac0bc2e0d --- /dev/null +++ b/cpp/src/arrow/array/statistics_test.cc @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/statistics.h" + +namespace arrow { + +TEST(ArrayStatisticsTest, TestNullCount) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.null_count.has_value()); + statistics.null_count = 29; + ASSERT_TRUE(statistics.null_count.has_value()); + ASSERT_EQ(29, statistics.null_count.value()); +} + +TEST(ArrayStatisticsTest, TestDistinctCount) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.distinct_count.has_value()); + statistics.distinct_count = 29; + ASSERT_TRUE(statistics.distinct_count.has_value()); + ASSERT_EQ(29, statistics.distinct_count.value()); +} + +TEST(ArrayStatisticsTest, TestMin) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.min.has_value()); + ASSERT_FALSE(statistics.is_min_exact.has_value()); + statistics.min = static_cast(29); + statistics.is_min_exact = true; + ASSERT_TRUE(statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.min.value())); + ASSERT_EQ(29, std::get(statistics.min.value())); + ASSERT_TRUE(statistics.is_min_exact.has_value()); + ASSERT_TRUE(statistics.is_min_exact.value()); +} + +TEST(ArrayStatisticsTest, TestMax) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.max.has_value()); + ASSERT_FALSE(statistics.is_max_exact.has_value()); + statistics.max = std::string("hello"); + statistics.is_max_exact = false; + ASSERT_TRUE(statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.max.value())); + ASSERT_EQ("hello", std::get(statistics.max.value())); + ASSERT_TRUE(statistics.is_max_exact.has_value()); + ASSERT_FALSE(statistics.is_max_exact.value()); +} + +TEST(ArrayStatisticsTest, TestEquality) { + ArrayStatistics statistics1; + ArrayStatistics statistics2; + + ASSERT_EQ(statistics1, statistics2); + + statistics1.null_count = 29; + ASSERT_NE(statistics1, statistics2); + statistics2.null_count = 29; + ASSERT_EQ(statistics1, statistics2); + + statistics1.distinct_count = 2929; + ASSERT_NE(statistics1, statistics2); + statistics2.distinct_count = 2929; + ASSERT_EQ(statistics1, statistics2); + + statistics1.min = std::string_view("world"); + ASSERT_NE(statistics1, statistics2); + statistics2.min = std::string_view("world"); + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_min_exact = false; + ASSERT_NE(statistics1, statistics2); + statistics2.is_min_exact = false; + ASSERT_EQ(statistics1, statistics2); + + statistics1.max = arrow::util::Float16(-29); + ASSERT_NE(statistics1, statistics2); + statistics2.max = arrow::util::Float16(-29); + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_max_exact = true; + ASSERT_NE(statistics1, statistics2); + statistics2.is_max_exact = true; + ASSERT_EQ(statistics1, statistics2); +} + +} // namespace arrow From bae2908b3142f1141941d740d42c5d403c01749f Mon Sep 17 00:00:00 2001 From: rene-hess Date: Sun, 4 Aug 2024 22:24:42 +0200 Subject: [PATCH 67/73] MINOR: [Go] Fix typo in documentation in go/arrow/table.go (#43550) MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change I found a small typo in the documentation ### Are there any user-facing changes? No Authored-by: René Heß Signed-off-by: Sutou Kouhei --- go/arrow/table.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/table.go b/go/arrow/table.go index 5ad2aa08e9341..c7a13fc37d28c 100644 --- a/go/arrow/table.go +++ b/go/arrow/table.go @@ -49,7 +49,7 @@ type Table interface { // To get strongly typed data from a Column, you need to iterate the // chunks and type assert each individual Array. For example: // -// switch column.DataType().ID { +// switch column.DataType().ID() { // case arrow.INT32: // for _, c := range column.Data().Chunks() { // arr := c.(*array.Int32) From 66cb7495d1a43f3539cf66f6d88bac40fb9d28d4 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 5 Aug 2024 16:28:53 +0800 Subject: [PATCH 68/73] GH-43382: [C++][Parquet] min-max Statistics doesn't work well when one of min-max is truncated (#43383) ### Rationale for this change See https://github.com/apache/arrow/issues/43382 ### What changes are included in this PR? Change the has-min-max check in the statistics from `min || max` to `min && max`. ### Are these changes tested? * [x] TODO ### Are there any user-facing changes? Might affect interfaces using `HasMinMax`. **This PR includes breaking changes to public APIs.** * GitHub Issue: #43382 Authored-by: mwish Signed-off-by: mwish --- .../parquet/arrow/arrow_statistics_test.cc | 23 +++++++++++++++++ cpp/src/parquet/metadata.cc | 4 +-- cpp/src/parquet/statistics.h | 2 +- cpp/src/parquet/statistics_test.cc | 25 +++++++++++++++++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index ad4496933ef4c..a19303c3dc03a 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -156,4 +156,27 @@ INSTANTIATE_TEST_SUITE_P( /*expected_min=*/"z", /*expected_max=*/"z"})); +TEST(StatisticsTest, TruncateOnlyHalfMinMax) { + // GH-43382: Tests that when we only have min or max, `HasMinMax` should be false.
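+  // Context for the assertions below: max_statistics_size(2) truncates the 3-byte +  // max value "abc" at write time while the 1-byte min value "a" still fits, so the +  // file carries only half of the min/max pair and HasMinMax() must report false +  // when the statistics are read back.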
+ std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer(); + auto out_stream = std::make_shared<::arrow::io::BufferOutputStream>(serialized_data); + auto schema = ::arrow::schema({::arrow::field("a", ::arrow::utf8())}); + ::parquet::WriterProperties::Builder properties_builder; + properties_builder.max_statistics_size(2); + ASSERT_OK_AND_ASSIGN( + std::unique_ptr writer, + FileWriter::Open(*schema, default_memory_pool(), out_stream, + properties_builder.build(), default_arrow_writer_properties())); + auto table = Table::Make(schema, {ArrayFromJSON(::arrow::utf8(), R"(["a", "abc"])")}); + ASSERT_OK(writer->WriteTable(*table, std::numeric_limits::max())); + ASSERT_OK(writer->Close()); + ASSERT_OK(out_stream->Close()); + + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(serialized_data); + auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); + std::shared_ptr metadata = parquet_reader->metadata(); + std::shared_ptr stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics(); + ASSERT_FALSE(stats->HasMinMax()); +} + } // namespace parquet::arrow diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 139793219df90..fe16f5b76bd09 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -97,7 +97,7 @@ static std::shared_ptr MakeTypedColumnStats( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value, + metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count); } @@ -106,7 +106,7 @@ static std::shared_ptr MakeTypedColumnStats( descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max || metadata.statistics.__isset.min, + metadata.statistics.__isset.max && metadata.statistics.__isset.min, metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count); } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 0d6ea9898f7ba..c5da44a7b6665 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -246,7 +246,7 @@ class PARQUET_EXPORT Statistics { /// \brief The number of non-null values in the column virtual int64_t num_values() const = 0; - /// \brief Return true if the min and max statistics are set. Obtain + /// \brief Return true if both min and max statistics are set. 
Obtain /// with TypedStatistics::min and max virtual bool HasMinMax() const = 0; diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index cb2e6455abfa9..dad414ac89b47 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1602,5 +1602,30 @@ TEST(TestEncodedStatistics, CopySafe) { EXPECT_EQ("abc", encoded_statistics.max()); } +TEST(TestEncodedStatistics, ApplyStatSizeLimits) { + EncodedStatistics encoded_statistics; + encoded_statistics.set_min("a"); + encoded_statistics.has_min = true; + + encoded_statistics.set_max("abc"); + encoded_statistics.has_max = true; + + encoded_statistics.ApplyStatSizeLimits(2); + + ASSERT_TRUE(encoded_statistics.has_min); + ASSERT_EQ("a", encoded_statistics.min()); + ASSERT_FALSE(encoded_statistics.has_max); + + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 0, 0); + std::shared_ptr> statistics = + std::dynamic_pointer_cast>( + Statistics::Make(&descr, &encoded_statistics, + /*num_values=*/1000)); + // GH-43382: HasMinMax should be false if one of min/max is not set. + EXPECT_FALSE(statistics->HasMinMax()); +} + } // namespace test } // namespace parquet From 57106384d7179622e399938191bbbe72864b5a5a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 6 Aug 2024 01:52:09 +0200 Subject: [PATCH 69/73] MINOR: [CI] Replace docker-compose with `docker compose` (#43566) "docker-compose" is not available on some CI images anymore, switch to the Docker built-in "compose" command. Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 2 +- dev/tasks/cpp-examples/github.linux.yml | 2 +- dev/tasks/python-minimal-build/github.linux.yml | 2 +- dev/tasks/r/azure.linux.yml | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index eff0b0204e6bd..fc7f3c5dded3c 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -189,7 +189,7 @@ jobs: - name: Run minimal example run: | cd cpp/examples/minimal_build - docker-compose run --rm minimal + docker compose run --rm minimal macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} C++ diff --git a/dev/tasks/cpp-examples/github.linux.yml b/dev/tasks/cpp-examples/github.linux.yml index fe26eb1e24e0e..febfee9ae1e60 100644 --- a/dev/tasks/cpp-examples/github.linux.yml +++ b/dev/tasks/cpp-examples/github.linux.yml @@ -30,4 +30,4 @@ jobs: shell: bash run: | cd arrow/cpp/examples/{{ type }} - docker-compose run --rm {{ run }} + docker compose run --rm {{ run }} diff --git a/dev/tasks/python-minimal-build/github.linux.yml b/dev/tasks/python-minimal-build/github.linux.yml index d97968b86b362..4216c5b1a2abf 100644 --- a/dev/tasks/python-minimal-build/github.linux.yml +++ b/dev/tasks/python-minimal-build/github.linux.yml @@ -31,4 +31,4 @@ jobs: - name: Run minimal build example run: | cd arrow/python/examples/{{ type }} - docker-compose run --rm {{ image }} + docker compose run --rm {{ image }} diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml index 28893a81728c3..a74208ab2542f 100644 --- a/dev/tasks/r/azure.linux.yml +++ b/dev/tasks/r/azure.linux.yml @@ -33,14 +33,14 @@ jobs: - script: | set -ex docker -v - docker-compose -v + docker compose version cd arrow export R_ORG={{ r_org }} export R_IMAGE={{ r_image }} export R_TAG={{ r_tag }} export R_CUSTOM_CCACHE={{ r_custom_ccache|default("false") }} - docker-compose pull 
--ignore-pull-failures r - docker-compose build r + docker compose pull --ignore-pull-failures r + docker compose build r displayName: Docker build env: {{ macros.azure_set_sccache_envvars()|indent(4) }} @@ -54,7 +54,7 @@ jobs: export ARROW_R_DEV={{ not_cran|default("TRUE") }} # Note that by default, ci/scripts/r_test.sh sets NOT_CRAN=true # if ARROW_R_DEV=TRUE. Pass `-e NOT_CRAN=false` to turn that off. - docker-compose run {{ flags|default("") }} r + docker compose run {{ flags|default("") }} r displayName: Docker run env: {{ macros.azure_set_sccache_envvars()|indent(4) }} From 629b9e7708535085007f4de3c5d5d27bbd69bea4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 08:56:10 +0900 Subject: [PATCH 70/73] MINOR: [C#] Bump Google.Protobuf from 3.27.2 to 3.27.3 in /csharp (#43571) Bumps [Google.Protobuf](https://github.com/protocolbuffers/protobuf) from 3.27.2 to 3.27.3.
Commits
  • 7cc670c Updating version.json and repo version numbers to: 27.3
  • 67d7298 Merge pull request #17617 from protocolbuffers/cp-utf8-ascii
  • e20cb7a Remove /utf-8 flag added in #14197
  • c9839cb Merge pull request #17473 from protocolbuffers/cp-revert-hack
  • 8a579c1 Downgrade CMake to 3.29 to workaround Abseil issue.
  • ba3e7d7 Revert workaround for std::mutex issues on github windows runners.
  • 861be78 Merge pull request #17331 from protocolbuffers/cp-cp
  • c1ec82f Merge pull request #17232 from simonberger/bugfix/php-ext-persistent-global-c...
  • aec8a76 Upgrade macos-11 tests to macos-12
  • 4e3b4f0 Use explicit names of our large runners
  • Additional commits viewable in compare view
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Google.Protobuf&package-manager=nuget&previous-version=3.27.2&new-version=3.27.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index c2081cbe1aa68..a46f0d9193556 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + From 9253c33bf93ec71d8bb539326507ac318a4a3ab3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:08:39 +0900 Subject: [PATCH 71/73] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.23.0 to 2.24.0 in /go (#43567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.23.0 to 2.24.0.
Release notes
Sourced from github.com/hamba/avro/v2's releases.
v2.24.0
What's Changed
New Contributors
Full Changelog: https://github.com/hamba/avro/compare/v2.23.0...v2.24.0
Commits
  • 4aff30f fix: decimal decoding into *big.Rat (#425)
  • 3fc81b6 Register Go int as long in type resolver (#423)
  • dfd5956 feat: support for Go int encoding/decoding into/from Avro long (#422)
  • f8a0492 feat: Improve zstd encoder and decoder usage in the compressor codec (#420)
  • See full diff in compare view
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/hamba/avro/v2&package-manager=go_modules&previous-version=2.23.0&new-version=2.24.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 43c2c41b69eca..af7c2b24f4a9e 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.23.0 + github.com/hamba/avro/v2 v2.24.0 github.com/huandu/xstrings v1.4.0 github.com/substrait-io/substrait-go v0.5.0 github.com/tidwall/sjson v1.2.5 diff --git a/go/go.sum b/go/go.sum index a96f0a3797c74..9ac913d5b9fa7 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.23.0 h1:DYWz6UqNCi21JflaZlcwNfW+rK+D/CwnrWWJtfmO4vw= -github.com/hamba/avro/v2 v2.23.0/go.mod h1:7vDfy/2+kYCE8WUHoj2et59GTv0ap7ptktMXu0QHePI= +github.com/hamba/avro/v2 v2.24.0 h1:axTlaYDkcSY0dVekRSy8cdrsj5MG86WqosUQacKCids= +github.com/hamba/avro/v2 v2.24.0/go.mod h1:7vDfy/2+kYCE8WUHoj2et59GTv0ap7ptktMXu0QHePI= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From e6b05a8e58ad5c20f6daa2fa64488b8efec51993 Mon Sep 17 00:00:00 2001 From: Jin Chengcheng Date: Tue, 6 Aug 2024 10:12:56 +0800 Subject: [PATCH 72/73] GH-43483: [Java][C++] Support more CsvFragmentScanOptions in JNI call (#43482) ### Rationale for this change Support more CSV fragment scan options ### What changes are included in this PR? Implement nearly all of the CsvFragmentScanOptions supported by the C++ code. ### Are these changes tested? Yes, by newly added tests and existing UTs. ### Are there any user-facing changes? No. * GitHub Issue: #28866 * GitHub Issue: #43483 Authored-by: Chengcheng Jin Signed-off-by: David Li --- java/dataset/src/main/cpp/jni_wrapper.cc | 113 ++++++++++--- .../scanner/csv/CsvFragmentScanOptions.java | 4 + .../dataset/TestFragmentScanOptions.java | 156 ++++++++++++++++++ 3 files changed, 254 insertions(+), 19 deletions(-) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index f324f87d6c301..63b8dd73f4720 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -368,29 +368,104 @@ std::shared_ptr LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec inline bool ParseBool(const std::string& value) { return value == "true" ?
true : false; } +// Parses a single-character option value; throws if `value` is not exactly one char. +inline char ParseChar(const std::string& key, const std::string& value) { + if (value.size() != 1) { + JniThrow("Option " + key + " should be a char, but is " + value); + } + return value.at(0); +} + /// \brief Construct FragmentScanOptions from config map #ifdef ARROW_CSV -arrow::Result> -ToCsvFragmentScanOptions(const std::unordered_map& configs) { + +bool SetCsvConvertOptions(arrow::csv::ConvertOptions& options, const std::string& key, + const std::string& value) { + if (key == "column_types") { + int64_t schema_address = std::stol(value); + ArrowSchema* c_schema = reinterpret_cast(schema_address); + auto schema = JniGetOrThrow(arrow::ImportSchema(c_schema)); + auto& column_types = options.column_types; + for (auto field : schema->fields()) { + column_types[field->name()] = field->type(); + } + } else if (key == "strings_can_be_null") { + options.strings_can_be_null = ParseBool(value); + } else if (key == "check_utf8") { + options.check_utf8 = ParseBool(value); + } else if (key == "null_values") { + options.null_values = {value}; + } else if (key == "true_values") { + options.true_values = {value}; + } else if (key == "false_values") { + options.false_values = {value}; + } else if (key == "quoted_strings_can_be_null") { + options.quoted_strings_can_be_null = ParseBool(value); + } else if (key == "auto_dict_encode") { + options.auto_dict_encode = ParseBool(value); + } else if (key == "auto_dict_max_cardinality") { + options.auto_dict_max_cardinality = std::stoi(value); + } else if (key == "decimal_point") { + options.decimal_point = ParseChar(key, value); + } else if (key == "include_missing_columns") { + options.include_missing_columns = ParseBool(value); + } else { + return false; + } + return true; +} + +bool SetCsvParseOptions(arrow::csv::ParseOptions& options, const std::string& key, + const std::string& value) { + if (key == "delimiter") { + options.delimiter = ParseChar(key, value); + } else if (key == "quoting") { + options.quoting = ParseBool(value); + } else if (key == "quote_char") { + options.quote_char = ParseChar(key, value); + } else if (key == "double_quote") { + options.double_quote = ParseBool(value); + } else if (key == "escaping") { + options.escaping = ParseBool(value); + } else if (key == "escape_char") { + options.escape_char = ParseChar(key, value); + } else if (key == "newlines_in_values") { + options.newlines_in_values = ParseBool(value); + } else if (key == "ignore_empty_lines") { + options.ignore_empty_lines = ParseBool(value); + } else { + return false; + } + return true; +} + +bool SetCsvReadOptions(arrow::csv::ReadOptions& options, const std::string& key, + const std::string& value) { + if (key == "use_threads") { + options.use_threads = ParseBool(value); + } else if (key == "block_size") { + options.block_size = std::stoi(value); + } else if (key == "skip_rows") { + options.skip_rows = std::stoi(value); + } else if (key == "skip_rows_after_names") { + options.skip_rows_after_names = std::stoi(value); + } else if (key == "autogenerate_column_names") { + options.autogenerate_column_names = ParseBool(value); + } else { + return false; + } + return true; +} + +std::shared_ptr ToCsvFragmentScanOptions( + const std::unordered_map& configs) { std::shared_ptr options = std::make_shared(); - for (auto const& [key, value] : configs) { - if (key == "delimiter") { - options->parse_options.delimiter = value.data()[0]; - } else if (key == "quoting") { - options->parse_options.quoting = ParseBool(value); - } else if (key == "column_types") { - int64_t
schema_address = std::stol(value); - ArrowSchema* c_schema = reinterpret_cast(schema_address); - ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(c_schema)); - auto& column_types = options->convert_options.column_types; - for (auto field : schema->fields()) { - column_types[field->name()] = field->type(); - } - } else if (key == "strings_can_be_null") { - options->convert_options.strings_can_be_null = ParseBool(value); - } else { - return arrow::Status::Invalid("Config " + key + " is not supported."); + for (const auto& [key, value] : configs) { + bool setValid = SetCsvParseOptions(options->parse_options, key, value) || + SetCsvConvertOptions(options->convert_options, key, value) || + SetCsvReadOptions(options->read_options, key, value); + if (!setValid) { + JniThrow("Config " + key + " is not supported."); } } return options; diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java index 39271b5f063fb..dddc36d38714e 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java @@ -32,6 +32,10 @@ public class CsvFragmentScanOptions implements FragmentScanOptions { * CSV scan options, map to CPP struct CsvFragmentScanOptions. The key in config map is the field * name of mapping cpp struct * + *
Currently, multi-valued options (which are std::vector values in C++) only support having a + * single value set. For example, for the null_values option, only one string can be set as the + * null value. + * * @param convertOptions similar to CsvFragmentScanOptions#convert_options in CPP, the ArrowSchema * represents column_types, convert data option such as null value recognition. * @param readOptions similar to CsvFragmentScanOptions#read_options in CPP, specify how to read diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java index 9787e8308e73e..d598190528811 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -18,10 +18,13 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import com.google.common.collect.ImmutableMap; import java.util.Arrays; import java.util.Collections; +import java.util.Map; import java.util.Optional; import org.apache.arrow.c.ArrowSchema; import org.apache.arrow.c.CDataDictionaryProvider; @@ -42,6 +45,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.Text; import org.hamcrest.collection.IsIterableContainingInOrder; import org.junit.jupiter.api.Test; @@ -165,4 +169,156 @@ public void testCsvConvertOptionsNoOption() throws Exception { assertEquals(3, rowCount); } } + + @Test + public void testCsvReadParseAndReadOptions() throws Exception { + final Schema schema = + new Schema( + Collections.singletonList(Field.nullable("Id;Name;Language", new ArrowType.Utf8())), + null); + String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; + BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + CsvFragmentScanOptions fragmentScanOptions = + new CsvFragmentScanOptions( + new CsvConvertOptions(ImmutableMap.of()), + ImmutableMap.of("skip_rows_after_names", "1"), + ImmutableMap.of("delimiter", ";")); + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .fragmentScanOptions(fragmentScanOptions) + .build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, + NativeMemoryPool.getDefault(), + FileFormat.CSV, + path, + Optional.of(fragmentScanOptions)); + Dataset dataset = datasetFactory.finish(); + Scanner scanner = dataset.newScan(options); + ArrowReader reader = scanner.scanBatches()) { + + assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); + int rowCount = 0; + while (reader.loadNextBatch()) { + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + assertThat( + idVector.getValueIterable(), + IsIterableContainingInOrder.contains( + new Text("2;Peter;Python"), new Text("3;Celin;C++"))); + rowCount += reader.getVectorSchemaRoot().getRowCount(); + } + assertEquals(2, rowCount); + } + } + + @Test + public void testCsvReadOtherOptions() throws Exception { + String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; + BufferAllocator allocator = new 
RootAllocator(Long.MAX_VALUE); + Map convertOption = + ImmutableMap.of( + "check_utf8", + "true", + "null_values", + "NULL", + "true_values", + "True", + "false_values", + "False", + "quoted_strings_can_be_null", + "true", + "auto_dict_encode", + "false", + "auto_dict_max_cardinality", + "3456", + "decimal_point", + ".", + "include_missing_columns", + "false"); + Map readOption = + ImmutableMap.of( + "use_threads", + "true", + "block_size", + "1024", + "skip_rows", + "12", + "skip_rows_after_names", + "12", + "autogenerate_column_names", + "false"); + Map parseOption = + ImmutableMap.of( + "delimiter", + ".", + "quoting", + "true", + "quote_char", + "'", + "double_quote", + "False", + "escaping", + "true", + "escape_char", + "v", + "newlines_in_values", + "false", + "ignore_empty_lines", + "true"); + CsvFragmentScanOptions fragmentScanOptions = + new CsvFragmentScanOptions(new CsvConvertOptions(convertOption), readOption, parseOption); + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .fragmentScanOptions(fragmentScanOptions) + .build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + Dataset dataset = datasetFactory.finish(); + Scanner scanner = dataset.newScan(options)) { + assertNotNull(scanner); + } + } + + @Test + public void testCsvInvalidOption() throws Exception { + String path = "file://" + getClass().getResource("/").getPath() + "/data/student.csv"; + BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + Map convertOption = ImmutableMap.of("not_exists_key_check_utf8", "true"); + CsvFragmentScanOptions fragmentScanOptions = + new CsvFragmentScanOptions( + new CsvConvertOptions(convertOption), ImmutableMap.of(), ImmutableMap.of()); + ScanOptions options = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .fragmentScanOptions(fragmentScanOptions) + .build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + Dataset dataset = datasetFactory.finish()) { + assertThrows(RuntimeException.class, () -> dataset.newScan(options)); + } + + CsvFragmentScanOptions fragmentScanOptionsFaultValue = + new CsvFragmentScanOptions( + new CsvConvertOptions(ImmutableMap.of()), + ImmutableMap.of("", ""), + ImmutableMap.of("escape_char", "vbvb")); + ScanOptions optionsFault = + new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .fragmentScanOptions(fragmentScanOptionsFaultValue) + .build(); + try (DatasetFactory datasetFactory = + new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + Dataset dataset = datasetFactory.finish()) { + assertThrows(RuntimeException.class, () -> dataset.newScan(optionsFault)); + } + } } From 51d50d750012d3ca04127f6723c4f1e69ff4f5dc Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 6 Aug 2024 09:41:19 +0200 Subject: [PATCH 73/73] GH-43559: [Python][CI] Add a Crossbow job with a debug CPython interpreter (#43565) ### Rationale for this change Debug builds of CPython help catch low-level errors when using the Python C API. This is illustrated in GH-43487: a debug build of CPython detected that we were incref'ing a Python object without holding the GIL (which is a race condition otherwise). ### What changes are included in this PR? 1. Add a Docker build with a conda-installed debug interpreter. 2. 
Add a Crossbow job to run said Docker build with Python 3.12. ### Are these changes tested? Yes, by the added Crossbow job. The job now fails with a crash in `test_udf.py`, because of GH-43487. ### Are there any user-facing changes? No. * GitHub Issue: #43559 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../conda-python-cpython-debug.dockerfile | 28 +++++++++++++++++++ dev/tasks/tasks.yml | 8 ++++++ docker-compose.yml | 25 +++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 ci/docker/conda-python-cpython-debug.dockerfile diff --git a/ci/docker/conda-python-cpython-debug.dockerfile b/ci/docker/conda-python-cpython-debug.dockerfile new file mode 100644 index 0000000000000..87bdcafe4092a --- /dev/null +++ b/ci/docker/conda-python-cpython-debug.dockerfile @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG repo +ARG arch +ARG python=3.8 +FROM ${repo}:${arch}-conda-python-${python} + +# (Docker oddity: ARG needs to be repeated after FROM) +ARG python=3.8 +RUN mamba install -y "conda-forge/label/python_debug::python=${python}[build=*_cpython]" && \ + mamba clean --all +# Quick check that we do have a debug mode CPython +RUN python -c "import sys; sys.gettotalrefcount()" diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 3b00bc0040bd1..07a4d638f1291 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1205,6 +1205,14 @@ tasks: image: conda-python {% endfor %} + test-conda-python-3.12-cpython-debug: + ci: github + template: docker-tests/github.linux.yml + params: + env: + PYTHON: 3.12 + image: conda-python-cpython-debug + test-conda-python-emscripten: ci: github template: docker-tests/github.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index cf22324f7cfb4..daa5c74bcb969 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -119,6 +119,7 @@ x-hierarchy: - conda-python: - conda-python-pandas: - conda-python-docs + - conda-python-cpython-debug - conda-python-cython2 - conda-python-dask - conda-python-emscripten @@ -1440,6 +1441,30 @@ services: volumes: *conda-volumes command: *python-conda-command + conda-python-cpython-debug: + # Usage: + # docker-compose build conda + # docker-compose build conda-cpp + # docker-compose build conda-python + # docker-compose build conda-python-cpython-debug + # docker-compose run --rm conda-python-cpython-debug + image: ${REPO}:${ARCH}-conda-python-${PYTHON}-cpython-debug + build: + context: .
+ dockerfile: ci/docker/conda-python-cpython-debug.dockerfile + cache_from: + - ${REPO}:${ARCH}-conda-python-${PYTHON}-cpython-debug + args: + repo: ${REPO} + arch: ${ARCH} + python: ${PYTHON} + shm_size: *shm-size + environment: + <<: [*common, *ccache] + PYTEST_ARGS: # inherit + volumes: *conda-volumes + command: *python-conda-command + ################################## R ######################################## ubuntu-r: