From 6e61c5e2163c8509411143752afc7f3bb37184cb Mon Sep 17 00:00:00 2001 From: Hendrik Makait Date: Thu, 7 Dec 2023 14:18:06 +0100 Subject: [PATCH 001/570] GH-39096: [Python] Release GIL in `.nbytes` (#39097) ### Rationale for this change The `.nbytes` holds the GIL while computing the data size in C++, which has caused performance issues in Dask because threads were blocking each other See #39096 ### Are these changes tested? I am not sure if additional tests are necessary here. If so, I'm happy to add them but would welcome some pointers. ### Are there any user-facing changes? No * Closes: #39096 Authored-by: Hendrik Makait Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 5 +++-- python/pyarrow/table.pxi | 15 +++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9d62bed51f4a4..789e30d3e9b00 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1206,8 +1206,9 @@ cdef class Array(_PandasConvertible): cdef: CResult[int64_t] c_size_res - c_size_res = ReferencedBufferSize(deref(self.ap)) - size = GetResultValue(c_size_res) + with nogil: + c_size_res = ReferencedBufferSize(deref(self.ap)) + size = GetResultValue(c_size_res) return size def get_total_buffer_size(self): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f93f5950902c7..2f8d1abd1f085 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -248,8 +248,9 @@ cdef class ChunkedArray(_PandasConvertible): cdef: CResult[int64_t] c_res_buffer - c_res_buffer = ReferencedBufferSize(deref(self.chunked_array)) - size = GetResultValue(c_res_buffer) + with nogil: + c_res_buffer = ReferencedBufferSize(deref(self.chunked_array)) + size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -2386,8 +2387,9 @@ cdef class RecordBatch(_Tabular): cdef: CResult[int64_t] c_res_buffer - c_res_buffer = ReferencedBufferSize(deref(self.batch)) - size = GetResultValue(c_res_buffer) + with nogil: + c_res_buffer = ReferencedBufferSize(deref(self.batch)) + size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -4337,8 +4339,9 @@ cdef class Table(_Tabular): cdef: CResult[int64_t] c_res_buffer - c_res_buffer = ReferencedBufferSize(deref(self.table)) - size = GetResultValue(c_res_buffer) + with nogil: + c_res_buffer = ReferencedBufferSize(deref(self.table)) + size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): From 1b634e7d274cf42089d1ab237905a550de36c260 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 7 Dec 2023 08:35:34 -0500 Subject: [PATCH 002/570] GH-37061: [Docs][Format] Clarify semantics of GetSchema in FSQL (#38549) ### Rationale for this change Schemas of result sets and bind parameters are ambiguous in a few cases when they interact. ### What changes are included in this PR? Add documentation clarifying the expected behavior. ### Are these changes tested? N/A ### Are there any user-facing changes? No * Closes: #37061 Authored-by: David Li Signed-off-by: David Li --- docs/source/format/FlightSql.rst | 21 +++++++++++++++++++++ format/FlightSql.proto | 10 ++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index f7521c3876493..add044c2d3621 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -120,6 +120,23 @@ the ``type`` should be ``ClosePreparedStatement``). 
``ActionCreatePreparedStatementRequest`` Create a new prepared statement for a SQL query. + The response will contain an opaque handle used to identify the + prepared statement. It may also contain two optional schemas: the + Arrow schema of the result set, and the Arrow schema of the bind + parameters (if any). Because the schema of the result set may + depend on the bind parameters, the schemas may not necessarily be + provided here as a result, or if provided, they may not be accurate. + Clients should not assume the schema provided here will be the + schema of any data actually returned by executing the prepared + statement. + + Some statements may have bind parameters without any specific type. + (As a trivial example for SQL, consider ``SELECT ?``.) It is + not currently specified how this should be handled in the bind + parameter schema above. We suggest either using a union type to + enumerate the possible types, or using the NA (null) type as a + wildcard/placeholder. + ``CommandPreparedStatementQuery`` Execute a previously created prepared statement and get the results. @@ -128,6 +145,10 @@ the ``type`` should be ``ClosePreparedStatement``). When used with GetFlightInfo: execute the prepared statement. The prepared statement can be reused after fetching results. + When used with GetSchema: get the expected Arrow schema of the + result set. If the client has bound parameter values with DoPut + previously, the server should take those values into account. + ``CommandPreparedStatementUpdate`` Execute a previously created prepared statement that does not return results. diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 9b5968e5306f0..581cf1f76d57c 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -1537,11 +1537,14 @@ message ActionCreatePreparedStatementResult { bytes prepared_statement_handle = 1; // If a result set generating query was provided, dataset_schema contains the - // schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. + // schema of the result set. It should be an IPC-encapsulated Schema, as described in Schema.fbs. + // For some queries, the schema of the results may depend on the schema of the parameters. The server + // should provide its best guess as to the schema at this point. Clients must not assume that this + // schema, if provided, will be accurate. bytes dataset_schema = 2; // If the query provided contained parameters, parameter_schema contains the - // schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. + // schema of the expected parameters. It should be an IPC-encapsulated Schema, as described in Schema.fbs. bytes parameter_schema = 3; } @@ -1743,6 +1746,9 @@ message TicketStatementQuery { * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * + * If the schema is retrieved after parameter values have been bound with DoPut, then the server should account + * for the parameters when determining the schema. * - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. * - GetFlightInfo: execute the prepared statement instance. 
*/ From f2fb8fffae91c0a909fe219b7948f3dd0e73db83 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Thu, 7 Dec 2023 12:18:30 -0500 Subject: [PATCH 003/570] GH-38928: [R] Fix spelling (#38929) ### Rationale for this change ### What changes are included in this PR? Spelling fixes to r/ ### Are these changes tested? ### Are there any user-facing changes? * Closes: #38928 Authored-by: Josh Soref <2119212+jsoref@users.noreply.github.com> Signed-off-by: Jacob Wujciak-Jens --- r/NEWS.md | 10 +++++----- r/R/arrow-object.R | 2 +- r/R/arrow-package.R | 2 +- r/R/compression.R | 2 +- r/R/config.R | 2 +- r/R/csv.R | 2 +- r/R/dataset.R | 2 +- r/R/dplyr-count.R | 2 +- r/R/dplyr-filter.R | 10 +++++----- r/R/dplyr-funcs-augmented.R | 2 +- r/R/dplyr-funcs-conditional.R | 2 +- r/R/dplyr-funcs-datetime.R | 4 ++-- r/R/dplyr-funcs-string.R | 2 +- r/R/dplyr-funcs-type.R | 4 ++-- r/R/duckdb.R | 2 +- r/R/extension.R | 6 +++--- r/R/feather.R | 4 ++-- r/R/filesystem.R | 2 +- r/R/parquet.R | 2 +- r/R/udf.R | 2 +- r/configure | 2 +- r/man/ExtensionType.Rd | 2 +- r/man/FileSystem.Rd | 2 +- r/man/add_filename.Rd | 2 +- r/man/codec_is_available.Rd | 2 +- r/man/io_thread_count.Rd | 2 +- r/man/new_extension_type.Rd | 2 +- r/man/open_dataset.Rd | 2 +- r/man/read_delim_arrow.Rd | 2 +- r/man/write_feather.Rd | 2 +- r/man/write_parquet.Rd | 2 +- r/src/altrep.cpp | 2 +- r/src/safe-call-into-r.h | 6 +++--- r/tests/testthat/helper-arrow.R | 2 +- r/tests/testthat/helper-skip.R | 4 ++-- r/tests/testthat/test-Array.R | 6 +++--- r/tests/testthat/test-backwards-compatibility.R | 2 +- r/tests/testthat/test-dataset-write.R | 4 ++-- r/tests/testthat/test-dplyr-funcs-datetime.R | 12 ++++++------ r/tests/testthat/test-dplyr-summarize.R | 6 +++--- r/tests/testthat/test-extension.R | 4 ++-- r/tools/nixlibs.R | 4 ++-- r/tools/update-checksums.R | 2 +- r/vignettes/arrow.Rmd | 2 +- r/vignettes/data_objects.Rmd | 2 +- r/vignettes/data_types.Rmd | 2 +- r/vignettes/data_wrangling.Rmd | 2 +- r/vignettes/developers/setup.Rmd | 6 +++--- r/vignettes/fs.Rmd | 4 ++-- r/vignettes/install.Rmd | 6 +++--- r/vignettes/read_write.Rmd | 2 +- 51 files changed, 84 insertions(+), 84 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 8c8852e9c86b9..8515facdff871 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -80,10 +80,10 @@ ## Installation -* MacOS builds now use the same installation pathway as on Linux (@assignUser, +* macOS builds now use the same installation pathway as on Linux (@assignUser, #37684). * A warning message is now issued on package load when running under emulation - on MacOS (i.e., use of x86 installation of R on M1/aarch64; #37777). + on macOS (i.e., use of x86 installation of R on M1/aarch64; #37777). * R scripts that run during configuration and installation are now run using the correct R interpreter (@meztez, #37225). * Failed libarrow builds now return more detailed output (@amoeba, #37727). @@ -416,7 +416,7 @@ As of version 10.0.0, `arrow` requires C++17 to build. This means that: * The `arrow.dev_repo` for nightly builds of the R package and prebuilt libarrow binaries is now . -* Brotli and BZ2 are shipped with MacOS binaries. BZ2 is shipped with Windows binaries. (#13484) +* Brotli and BZ2 are shipped with macOS binaries. BZ2 is shipped with Windows binaries. (#13484) # arrow 8.0.0 @@ -549,7 +549,7 @@ Arrow arrays and tables can be easily concatenated: ## Other improvements and fixes * Many of the vignettes have been reorganized, restructured and expanded to improve their usefulness and clarity. 
-* Code to generate schemas (and individual data type specficiations) are accessible with the `$code()` method on a `schema` or `type`. This allows you to easily get the code needed to create a schema from an object that already has one. +* Code to generate schemas (and individual data type specifications) are accessible with the `$code()` method on a `schema` or `type`. This allows you to easily get the code needed to create a schema from an object that already has one. * Arrow `Duration` type has been mapped to R's `difftime` class. * The `decimal256()` type is supported. The `decimal()` function has been revised to call either `decimal256()` or `decimal128()` based on the value of the `precision` argument. * `write_parquet()` uses a reasonable guess at `chunk_size` instead of always writing a single chunk. This improves the speed of reading and writing large Parquet files. @@ -824,7 +824,7 @@ to send and receive data. See `vignette("flight", package = "arrow")` for an ove * `arrow` now depends on [`cpp11`](https://cpp11.r-lib.org/), which brings more robust UTF-8 handling and faster compilation * The Linux build script now succeeds on older versions of R -* MacOS binary packages now ship with zstandard compression enabled +* macOS binary packages now ship with zstandard compression enabled ## Bug fixes and other enhancements diff --git a/r/R/arrow-object.R b/r/R/arrow-object.R index 5c2cf4691fc9c..b66c39dce957e 100644 --- a/r/R/arrow-object.R +++ b/r/R/arrow-object.R @@ -56,7 +56,7 @@ ArrowObject <- R6Class("ArrowObject", # Return NULL, because keeping this R6 object in scope is not a good idea. # This syntax would allow the rare use that has to actually do this to # do `object <- object$.unsafe_delete()` and reduce the chance that an - # IDE like RStudio will try try to call other methods which will error + # IDE like RStudio will try to call other methods which will error invisible(NULL) } ) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 1f39a50744abc..54e237192e080 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -183,7 +183,7 @@ configure_tzdb <- function() { # Just to be extra safe, let's wrap this in a try(); # we don't want a failed startup message to prevent the package from loading try({ - # On MacOS only, Check if we are running in under emulation, and warn this will not work + # On macOS only, Check if we are running in under emulation, and warn this will not work if (on_rosetta()) { packageStartupMessage( paste( diff --git a/r/R/compression.R b/r/R/compression.R index 8d28fbefd7b3d..3fe00a756987c 100644 --- a/r/R/compression.R +++ b/r/R/compression.R @@ -61,7 +61,7 @@ Codec$create <- function(type = "gzip", compression_level = NA) { #' the Arrow C++ library. This function lets you know which are available for #' use. #' @param type A string, one of "uncompressed", "snappy", "gzip", "brotli", -#' "zstd", "lz4", "lzo", or "bz2", case insensitive. +#' "zstd", "lz4", "lzo", or "bz2", case-insensitive. #' @return Logical: is `type` available? #' @export #' @examples diff --git a/r/R/config.R b/r/R/config.R index bd00afe1be631..941d74e59a90d 100644 --- a/r/R/config.R +++ b/r/R/config.R @@ -40,7 +40,7 @@ io_thread_count <- function() { #' @rdname io_thread_count #' @param num_threads integer: New number of threads for thread pool. At least -#' two threads are reccomended to support all operations in the arrow +#' two threads are recommended to support all operations in the arrow #' package. 
#' @export set_io_thread_count <- function(num_threads) { diff --git a/r/R/csv.R b/r/R/csv.R index a024c4531e748..03540006ca0a2 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -76,7 +76,7 @@ #' #' Note that if you are specifying column names, whether by `schema` or #' `col_names`, and the CSV file has a header row that would otherwise be used -#' to idenfity column names, you'll need to add `skip = 1` to skip that row. +#' to identify column names, you'll need to add `skip = 1` to skip that row. #' #' @param file A character file name or URI, literal data (either a single string or a [raw] vector), #' an Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`). diff --git a/r/R/dataset.R b/r/R/dataset.R index 682f6c1481b4f..08189f1b290a2 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -46,7 +46,7 @@ #' #' The default behavior in `open_dataset()` is to inspect the file paths #' contained in the provided directory, and if they look like Hive-style, parse -#' them as Hive. If your dataset has Hive-style partioning in the file paths, +#' them as Hive. If your dataset has Hive-style partitioning in the file paths, #' you do not need to provide anything in the `partitioning` argument to #' `open_dataset()` to use them. If you do provide a character vector of #' partition column names, they will be ignored if they match what is detected, diff --git a/r/R/dplyr-count.R b/r/R/dplyr-count.R index ee713030b262e..df585a6cf0111 100644 --- a/r/R/dplyr-count.R +++ b/r/R/dplyr-count.R @@ -56,7 +56,7 @@ tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) { tally.Dataset <- tally.ArrowTabular <- tally.RecordBatchReader <- tally.arrow_dplyr_query -# we don't want to depend on dplyr, but we refrence these above +# we don't want to depend on dplyr, but we reference these above utils::globalVariables(c("n", "desc")) check_n_name <- function(name, diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index c14c67e70168c..d85fa16af2e71 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -28,20 +28,20 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) out$group_by_vars <- by$names } - filts <- expand_across(out, quos(...)) - if (length(filts) == 0) { + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { # Nothing to do return(as_adq(.data)) } # tidy-eval the filter expressions inside an Arrow data_mask - filters <- lapply(filts, arrow_eval, arrow_mask(out)) + filters <- lapply(expanded_filters, arrow_eval, arrow_mask(out)) bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) if (any(bad_filters)) { # This is similar to abandon_ship() except that the filter eval is # vectorized, and we apply filters that _did_ work before abandoning ship # with the rest - expr_labs <- map_chr(filts[bad_filters], format_expr) + expr_labs <- map_chr(expanded_filters[bad_filters], format_expr) if (query_on_dataset(out)) { # Abort. We don't want to auto-collect if this is a Dataset because that # could blow up, too big. 
@@ -71,7 +71,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) if (by$from_by) { out <- dplyr::ungroup(out) } - return(dplyr::filter(out, !!!filts[bad_filters], .by = {{ .by }})) + return(dplyr::filter(out, !!!expanded_filters[bad_filters], .by = {{ .by }})) } } diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index 116248d2dd92a..dca5ca16fa437 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -18,7 +18,7 @@ #' Add the data filename as a column #' #' This function only exists inside `arrow` `dplyr` queries, and it only is -#' valid when quering on a `FileSystemDataset`. +#' valid when querying on a `FileSystemDataset`. #' #' To use filenames generated by this function in subsequent pipeline steps, you #' must either call \code{\link[dplyr:compute]{compute()}} or diff --git a/r/R/dplyr-funcs-conditional.R b/r/R/dplyr-funcs-conditional.R index cd0245eeee182..b9639f00295ce 100644 --- a/r/R/dplyr-funcs-conditional.R +++ b/r/R/dplyr-funcs-conditional.R @@ -55,7 +55,7 @@ register_bindings_conditional <- function() { } if (last_arg && arg$type_id() %in% TYPES_WITH_NAN) { - # store the NA_real_ in the same type as arg to avoid avoid casting + # store the NA_real_ in the same type as arg to avoid casting # smaller float types to larger float types NA_expr <- Expression$scalar(Scalar$create(NA_real_, type = arg$type())) Expression$create("if_else", Expression$create("is_nan", arg), NA_expr, arg) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5b6e16d376554..440210afd630c 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -459,7 +459,7 @@ register_bindings_datetime_timezone <- function() { roll_dst[1], "error" = 0L, "boundary" = 2L, - arrow_not_supported("`roll_dst` value must be 'error' or 'boundary' for non-existent times; other values") + arrow_not_supported("`roll_dst` value must be 'error' or 'boundary' for nonexistent times; other values") ) ambiguous <- switch( @@ -467,7 +467,7 @@ register_bindings_datetime_timezone <- function() { "error" = 0L, "pre" = 1L, "post" = 2L, - arrow_not_supported("`roll_dst` value must be 'error', 'pre', or 'post' for non-existent times") + arrow_not_supported("`roll_dst` value must be 'error', 'pre', or 'post' for nonexistent times") ) if (identical(tzone, "")) { diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 3cd8f94476e5e..9f3220e557f08 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -516,7 +516,7 @@ register_bindings_string_other <- function() { msg = "`stop` must be length 1 - other lengths are not supported in Arrow" ) - # substr treats values as if they're on a continous number line, so values + # substr treats values as if they're on a continuous number line, so values # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics # this behavior if (start <= 0) { diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 0bd340d4be2dd..f244682737cb4 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -158,8 +158,8 @@ register_bindings_type_cast <- function() { if (identical(fix.empty.names, TRUE)) { names(args) <- make.names(names(args), unique = TRUE) } else { - name_emtpy <- names(args) == "" - names(args)[!name_emtpy] <- make.names(names(args)[!name_emtpy], unique = TRUE) + name_empty <- names(args) == "" + names(args)[!name_empty] <- make.names(names(args)[!name_empty], unique = TRUE) } } diff --git a/r/R/duckdb.R b/r/R/duckdb.R index 
bf3a57daf2f1e..9632e9bad1984 100644 --- a/r/R/duckdb.R +++ b/r/R/duckdb.R @@ -89,7 +89,7 @@ arrow_duck_connection <- function() { # but if we don't explicitly run dbDisconnect() the user gets a warning # that they may not expect (since they did not open a duckdb connection). # This bit of code will run when the package namespace is cleaned up (i.e., - # at exit). This is more reliable than .onUnload() or .onDetatch(), which + # at exit). This is more reliable than .onUnload() or .onDetach(), which # don't necessarily run on exit. reg.finalizer(arrow_duck_finalizer, function(...) { con <- getOption("arrow_duck_con") diff --git a/r/R/extension.R b/r/R/extension.R index 4419c8ba01642..59a02121fd18c 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -83,7 +83,7 @@ ExtensionArray$create <- function(x, type) { #' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray] #' with this extension type. #' -#' In addition, subclasses may override the following methos to customize +#' In addition, subclasses may override the following methods to customize #' the behaviour of extension classes. #' #' - `$deserialize_instance()`: This method is called when a new [ExtensionType] @@ -184,7 +184,7 @@ ExtensionType <- R6Class("ExtensionType", }, ToString = function() { # metadata is probably valid UTF-8 (e.g., JSON), but might not be - # and it's confusing to error when printing the object. This herustic + # and it's confusing to error when printing the object. This heuristic # isn't perfect (but subclasses should override this method anyway) metadata_raw <- self$extension_metadata() @@ -286,7 +286,7 @@ ExtensionType$create <- function(storage_type, #' "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow" #' is reserved for extension types defined by the Apache Arrow libraries. #' @param extension_metadata A [raw()] or [character()] vector containing the -#' serialized version of the type. Chatacter vectors must be length 1 and +#' serialized version of the type. Character vectors must be length 1 and #' are converted to UTF-8 before converting to [raw()]. #' @param type_class An [R6::R6Class] whose `$new()` class method will be #' used to construct a new instance of the type. diff --git a/r/R/feather.R b/r/R/feather.R index 3e390018c825f..474fc6118e44f 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -24,7 +24,7 @@ #' a legacy version available starting in 2016, and the Version 2 (V2), #' which is the Apache Arrow IPC file format. #' The default version is V2. -#' V1 files are distinct from Arrow IPC files and lack many feathures, +#' V1 files are distinct from Arrow IPC files and lack many features, #' such as the ability to store all Arrow data tyeps, and compression support. #' [write_ipc_file()] can only write V2 files. #' @@ -91,7 +91,7 @@ write_feather <- function(x, } } if (is.null(compression_level)) { - # Use -1 as sentinal for "default" + # Use -1 as sentinel for "default" compression_level <- -1L } compression_level <- as.integer(compression_level) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index e0f370ad601b3..c6f92cba1932c 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -156,7 +156,7 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' buckets if `$CreateDir()` is called on the bucket level (default `FALSE`). #' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete #' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`). 
-#' - `request_timeout`: Socket read time on Windows and MacOS in seconds. If +#' - `request_timeout`: Socket read time on Windows and macOS in seconds. If #' negative, the AWS SDK default (typically 3 seconds). #' - `connect_timeout`: Socket connection timeout in seconds. If negative, AWS #' SDK default is used (typically 1 second). diff --git a/r/R/parquet.R b/r/R/parquet.R index 74f51767a29c4..d92e913cb5db3 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -128,7 +128,7 @@ read_parquet <- function(file, #' - A named vector, to specify the value for the named columns, the default #' value for the setting is used when not supplied #' -#' The `compression` argument can be any of the following (case insensitive): +#' The `compression` argument can be any of the following (case-insensitive): #' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2". #' Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip" #' are almost always included. See [codec_is_available()]. diff --git a/r/R/udf.R b/r/R/udf.R index fe08f02812fd9..922095cceba6a 100644 --- a/r/R/udf.R +++ b/r/R/udf.R @@ -154,7 +154,7 @@ arrow_scalar_function <- function(fun, in_type, out_type, auto_convert = FALSE) sprintf( paste0( "Expected `fun` to accept %d argument(s)\n", - "but found a function that acccepts %d argument(s)\n", + "but found a function that accepts %d argument(s)\n", "Did you forget to include `context` as the first argument?" ), expected_n_args, diff --git a/r/configure b/r/configure index 96238f0b9a37e..029fc004dfc4c 100755 --- a/r/configure +++ b/r/configure @@ -62,7 +62,7 @@ PKG_CONFIG_NAME="arrow" PKG_BREW_NAME="apache-arrow" PKG_TEST_HEADER="" -# Some env vars that control the build (all logical, case insensitive) +# Some env vars that control the build (all logical, case-insensitive) # Development mode, also increases verbosity in the bundled build ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'` # The bundled build compiles arrow C++ from source; FORCE ensures we don't pick up diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index 032a4a76bf80b..aef4d01d7539e 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -26,7 +26,7 @@ extension metadata as a UTF-8 encoded string. with this extension type. } -In addition, subclasses may override the following methos to customize +In addition, subclasses may override the following methods to customize the behaviour of extension classes. \itemize{ \item \verb{$deserialize_instance()}: This method is called when a new \link{ExtensionType} diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index b71d95f423ee3..dbf89ef1387ac 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -57,7 +57,7 @@ in the background, without blocking (default \code{TRUE}) buckets if \verb{$CreateDir()} is called on the bucket level (default \code{FALSE}). \item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will delete buckets if\verb{$DeleteDir()} is called on the bucket level (default \code{FALSE}). -\item \code{request_timeout}: Socket read time on Windows and MacOS in seconds. If +\item \code{request_timeout}: Socket read time on Windows and macOS in seconds. If negative, the AWS SDK default (typically 3 seconds). \item \code{connect_timeout}: Socket connection timeout in seconds. If negative, AWS SDK default is used (typically 1 second). 
diff --git a/r/man/add_filename.Rd b/r/man/add_filename.Rd index 93718435a2042..1fe10ea4f8f26 100644 --- a/r/man/add_filename.Rd +++ b/r/man/add_filename.Rd @@ -12,7 +12,7 @@ augmented column. } \description{ This function only exists inside \code{arrow} \code{dplyr} queries, and it only is -valid when quering on a \code{FileSystemDataset}. +valid when querying on a \code{FileSystemDataset}. } \details{ To use filenames generated by this function in subsequent pipeline steps, you diff --git a/r/man/codec_is_available.Rd b/r/man/codec_is_available.Rd index 5cda813f41673..e79b5724b8b17 100644 --- a/r/man/codec_is_available.Rd +++ b/r/man/codec_is_available.Rd @@ -8,7 +8,7 @@ codec_is_available(type) } \arguments{ \item{type}{A string, one of "uncompressed", "snappy", "gzip", "brotli", -"zstd", "lz4", "lzo", or "bz2", case insensitive.} +"zstd", "lz4", "lzo", or "bz2", case-insensitive.} } \value{ Logical: is \code{type} available? diff --git a/r/man/io_thread_count.Rd b/r/man/io_thread_count.Rd index 6cd44e1f6ea94..ae9297bb57761 100644 --- a/r/man/io_thread_count.Rd +++ b/r/man/io_thread_count.Rd @@ -11,7 +11,7 @@ set_io_thread_count(num_threads) } \arguments{ \item{num_threads}{integer: New number of threads for thread pool. At least -two threads are reccomended to support all operations in the arrow +two threads are recommended to support all operations in the arrow package.} } \description{ diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 6d0f27c321991..a7307e538b940 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -32,7 +32,7 @@ array.} is reserved for extension types defined by the Apache Arrow libraries.} \item{extension_metadata}{A \code{\link[=raw]{raw()}} or \code{\link[=character]{character()}} vector containing the -serialized version of the type. Chatacter vectors must be length 1 and +serialized version of the type. Character vectors must be length 1 and are converted to UTF-8 before converting to \code{\link[=raw]{raw()}}.} \item{type_class}{An \link[R6:R6Class]{R6::R6Class} whose \verb{$new()} class method will be diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd index 7c3d32289f73e..7028f38467303 100644 --- a/r/man/open_dataset.Rd +++ b/r/man/open_dataset.Rd @@ -142,7 +142,7 @@ what names to give the virtual columns that come from the path segments. The default behavior in \code{open_dataset()} is to inspect the file paths contained in the provided directory, and if they look like Hive-style, parse -them as Hive. If your dataset has Hive-style partioning in the file paths, +them as Hive. If your dataset has Hive-style partitioning in the file paths, you do not need to provide anything in the \code{partitioning} argument to \code{open_dataset()} to use them. If you do provide a character vector of partition column names, they will be ignored if they match what is detected, diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index 999f2d265b7fd..b56d445c9e2e3 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -230,7 +230,7 @@ be dropped. Note that if you are specifying column names, whether by \code{schema} or \code{col_names}, and the CSV file has a header row that would otherwise be used -to idenfity column names, you'll need to add \code{skip = 1} to skip that row. +to identify column names, you'll need to add \code{skip = 1} to skip that row. 
} \examples{ diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 78cf60b67477f..0d3a7da3b90b4 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -59,7 +59,7 @@ and to make sharing data across data analysis languages easy. a legacy version available starting in 2016, and the Version 2 (V2), which is the Apache Arrow IPC file format. The default version is V2. -V1 files are distinct from Arrow IPC files and lack many feathures, +V1 files are distinct from Arrow IPC files and lack many features, such as the ability to store all Arrow data tyeps, and compression support. \code{\link[=write_ipc_file]{write_ipc_file()}} can only write V2 files. } diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index af976b1aabf81..480abb12fcf4a 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -86,7 +86,7 @@ value for each column, in positional order value for the setting is used when not supplied } -The \code{compression} argument can be any of the following (case insensitive): +The \code{compression} argument can be any of the following (case-insensitive): "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2". Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip" are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}. diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 9bacf07d1840e..9745393d01bbc 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -747,7 +747,7 @@ struct AltrepVectorString : public AltrepVectorBase> { // Helper class to convert to R strings. We declare one of these for the // class to avoid having to stack-allocate one for every STRING_ELT call. // This class does not own a reference to any arrays: it is the caller's - // responsibility to ensure the Array lifetime exeeds that of the viewer. + // responsibility to ensure the Array lifetime exceeds that of the viewer. struct RStringViewer { RStringViewer() : strip_out_nuls_(false), nul_was_stripped_(false) {} diff --git a/r/src/safe-call-into-r.h b/r/src/safe-call-into-r.h index 319d46d11f0d6..0ffd1d16dca01 100644 --- a/r/src/safe-call-into-r.h +++ b/r/src/safe-call-into-r.h @@ -141,15 +141,15 @@ class MainRThread { MainRThread() : initialized_(false), executor_(nullptr), stop_source_(nullptr) {} }; -// This object is used to ensure that signal hanlders are registered when +// This object is used to ensure that signal handlers are registered when // RunWithCapturedR launches its background thread to call Arrow and is // cleaned up however this exits. Note that the lifecycle of the StopSource, // which is registered at package load, is not necessarily tied to the // lifecycle of the signal handlers. The general approach is to register // the signal handlers only when we are evaluating code outside the R thread // (when we are evaluating code *on* the R thread, R's signal handlers are -// sufficient and will signal an interupt condition that will propagate -// via a cpp11::unwind_excpetion). +// sufficient and will signal an interrupt condition that will propagate +// via a cpp11::unwind_exception). 
class WithSignalHandlerContext { public: WithSignalHandlerContext() : signal_handler_registered_(false) { diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index 8d39f7252ee21..e277c645d456e 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -37,7 +37,7 @@ with_language <- function(lang, expr) { skip_on_cran() old <- Sys.getenv("LANGUAGE") # Check what this message is before changing languages; this will - # trigger caching the transations if the OS does that (some do). + # trigger caching the translations if the OS does that (some do). # If the OS does cache, then we can't test changing languages safely. before <- i18ize_error_messages() Sys.setenv(LANGUAGE = lang) diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 3d68dac5af69b..bd29080848184 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -38,11 +38,11 @@ skip_if_not_available <- function(feature) { skip_on_linux_devel() } - # curl/ssl on MacOS is too old to support S3 filesystems without + # curl/ssl on macOS is too old to support S3 filesystems without # crashing when the process exits. if (feature == "s3") { if (on_macos_10_13_or_lower()) { - skip("curl/ssl runtime on MacOS 10.13 is too old") + skip("curl/ssl runtime on macOS 10.13 is too old") } } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index b29c1f4e09dde..bb005605de318 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -371,19 +371,19 @@ test_that("support for NaN (ARROW-3615)", { expect_equal(y$null_count, 1L) }) -test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", { +test_that("is.nan() evaluates to FALSE on NA (for consistency with base R)", { x <- c(1.0, NA, NaN, -1.0) compare_expression(is.nan(.input), x) }) -test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", { +test_that("is.nan() evaluates to FALSE on non-floats (for consistency with base R)", { x <- c(1L, 2L, 3L) y <- c("foo", "bar") compare_expression(is.nan(.input), x) compare_expression(is.nan(.input), y) }) -test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", { +test_that("is.na() evaluates to TRUE on NaN (for consistency with base R)", { x <- c(1, NA, NaN, -1) compare_expression(is.na(.input), x) }) diff --git a/r/tests/testthat/test-backwards-compatibility.R b/r/tests/testthat/test-backwards-compatibility.R index 8210bd2e78fd8..5f804b02dcee7 100644 --- a/r/tests/testthat/test-backwards-compatibility.R +++ b/r/tests/testthat/test-backwards-compatibility.R @@ -22,7 +22,7 @@ # To write a new version of a test file for an old version, use docker(-compose) # to setup a linux distribution and use RStudio's public package manager binary # repo to install the old version. The following commands should be run at the -# root of the arrow repo directory and might need slight adjusments. +# root of the arrow repo directory and might need slight adjustments. 
# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose build --no-cache r # R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose run r /bin/bash # R diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R index 28ff308747584..9f69380c55b3b 100644 --- a/r/tests/testthat/test-dataset-write.R +++ b/r/tests/testthat/test-dataset-write.R @@ -139,7 +139,7 @@ test_that("Writing a dataset: Parquet->Parquet (default)", { ) }) -test_that("Writing a dataset: `basename_template` default behavier", { +test_that("Writing a dataset: `basename_template` default behavior", { ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") dst_dir <- make_temp_dir() @@ -840,7 +840,7 @@ test_that("Writing a dataset to text files with wrapper functions.", { expect_equal(new_ds %>% collect(), df) }) -test_that("Writing a flat file dataset: `basename_template` default behavier", { +test_that("Writing a flat file dataset: `basename_template` default behavior", { ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") dst_dir <- make_temp_dir() diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e707a194a3626..4d3226798d3ff 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1550,7 +1550,7 @@ test_that("as.difftime()", { ) # only integer (or integer-like) -> duration conversion supported in Arrow. - # double -> duration not supported. we're not testing the content of the + # double -> duration not supported. We aren't testing the content of the # error message as it is being generated in the C++ code and it might change, # but we want to make sure that this error is raised in our binding implementation expect_error( @@ -1961,7 +1961,7 @@ test_that("`as.Date()` and `as_date()`", { # `as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg # to `as.Date()` # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL + # passed if`tz` is NULL compare_dplyr_binding( .input %>% transmute( @@ -2831,7 +2831,7 @@ test_that("parse_date_time with truncated formats", { }) test_that("parse_date_time with `locale != NULL` not supported", { - # parse_date_time currently doesn't take locale paramete which will be + # parse_date_time currently doesn't take locale parameter which will be # addressed in https://issues.apache.org/jira/browse/ARROW-17147 skip_if_not_available("re2") @@ -3038,7 +3038,7 @@ test_that("build_formats() and build_format_from_order()", { # an "easy" date to avoid conflating tests of different things (i.e., it's # UTC time, and not one of the edge cases on or extremely close to the -# rounding boundaty) +# rounding boundary) easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC") easy_df <- tibble::tibble(datetime = easy_date) @@ -3703,7 +3703,7 @@ test_that("with_tz() and force_tz() works", { roll_dst = "post") ) %>% collect(), - "roll_dst` value must be 'error' or 'boundary' for non-existent times" + "roll_dst` value must be 'error' or 'boundary' for nonexistent times" ) expect_warning( @@ -3716,7 +3716,7 @@ test_that("with_tz() and force_tz() works", { ) ) %>% collect(), - "`roll_dst` value must be 'error', 'pre', or 'post' for non-existent times" + "`roll_dst` value must be 'error', 'pre', or 'post' for nonexistent times" ) # Raise error when the timezone falls into the DST-break diff --git a/r/tests/testthat/test-dplyr-summarize.R 
b/r/tests/testthat/test-dplyr-summarize.R index d39c800f3ff0c..b2b2a9e54695d 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -355,7 +355,7 @@ test_that("Functions that take ... but we only accept a single arg", { test_that("median()", { # When medians are integer-valued, stats::median() sometimes returns output of - # type integer, whereas whereas the Arrow approx_median kernels always return + # type integer, whereas the Arrow approx_median kernels always return # output of type float64. The calls to median(int, ...) in the tests below # are enclosed in as.double() to work around this known difference. @@ -434,7 +434,7 @@ test_that("quantile()", { # returned by Arrow. # When quantiles are integer-valued, stats::quantile() sometimes returns - # output of type integer, whereas whereas the Arrow tdigest kernels always + # output of type integer, whereas the Arrow tdigest kernels always # return output of type float64. The calls to quantile(int, ...) in the tests # below are enclosed in as.double() to work around this known difference. @@ -841,7 +841,7 @@ test_that("Expressions on aggregations", { ) ) - # Check aggregates on aggeregates with more complex calls + # Check aggregates on aggregates with more complex calls expect_warning( record_batch(tbl) %>% summarise(any(any(!lgl))), paste( diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 55a1f8d21eedb..8b3d7d8aaa902 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -256,7 +256,7 @@ test_that("RecordBatch can roundtrip extension types", { ) # check both column orders, since column order should stay in the same - # order whether the colunns are are extension types or not + # order whether the columns are extension types or not mixed_record_batch2 <- record_batch( normal = normal_vctr, custom = custom_array @@ -296,7 +296,7 @@ test_that("Table can roundtrip extension types", { ) # check both column orders, since column order should stay in the same - # order whether the colunns are are extension types or not + # order whether the columns are extension types or not mixed_table2 <- arrow_table( normal = normal_vctr, custom = custom_array diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index f4ae7312d3757..1794acee70d22 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-#### Fuctions #### check end of file for main logic +#### Functions #### check end of file for main logic env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) # Log messages in the style of the configure script @@ -896,7 +896,7 @@ download_libarrow_ok <- download_ok && !env_is("LIBARROW_DOWNLOAD", "false") thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tools/thirdparty_dependencies") arrow_versioned <- paste0("arrow-", VERSION) -# configure.win uses a different libarrow dir and and the zip is already nested +# configure.win uses a different libarrow dir and the zip is already nested if (on_windows) { lib_dir <- "windows" dst_dir <- lib_dir diff --git a/r/tools/update-checksums.R b/r/tools/update-checksums.R index 8b9f1e6959cfd..f41652e87849e 100644 --- a/r/tools/update-checksums.R +++ b/r/tools/update-checksums.R @@ -49,7 +49,7 @@ binary_paths <- readLines(tasks_yml) |> artifactory_root <- "https://apache.jfrog.io/artifactory/arrow/r/%s/libarrow/bin/%s" -# Get the checksuym file from the artifactory +# Get the checksum file from the artifactory for (path in binary_paths) { sha_path <- paste0(path, ".sha512") file <- file.path("tools/checksums", sha_path) diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index c218b08ede77b..50329334ce8b0 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -66,7 +66,7 @@ as.data.frame(dat) When this coercion takes place, each of the columns in the original Arrow Table must be converted to native R data objects. In the `dat` Table, for instance, `dat$x` is stored as the Arrow data type int32 inherited from C++, which becomes an R integer type when `as.data.frame()` is called. -It is possible to exercise fine grained control over this conversion process. To learn more about the different types and how they are converted, see the [data types](./data_types.html) article. +It is possible to exercise fine-grained control over this conversion process. To learn more about the different types and how they are converted, see the [data types](./data_types.html) article. ## Reading and writing data diff --git a/r/vignettes/data_objects.Rmd b/r/vignettes/data_objects.Rmd index 7fcef8e6e78c6..065745182df04 100644 --- a/r/vignettes/data_objects.Rmd +++ b/r/vignettes/data_objects.Rmd @@ -259,7 +259,7 @@ write_parquet(df_b, file.path(ds_dir_b, "part-0.parquet")) write_parquet(df_c, file.path(ds_dir_c, "part-0.parquet")) ``` -If we had wanted to, we could have further subdivided the dataset. A folder could contain multiple files (`part-0.parquet`, `part-1.parquet`, etc) if we wanted it to. Similarly, there is no particular reason to name the files `part-0.parquet` this way at all: it would have been fine to call these files `subset-a.parquet`, `subset-b.parquet`, and `subset-c.parquet` if we had wished. We could have written other file formats if we wanted, and we don't necessarily have to use Hive-style folders. You can learn more about the supported formats by reading the help documentation for `open_dataset()`, and learn about how to exercise fine grained control with `help("Dataset", package = "arrow")`. +If we had wanted to, we could have further subdivided the dataset. A folder could contain multiple files (`part-0.parquet`, `part-1.parquet`, etc) if we wanted it to. Similarly, there is no particular reason to name the files `part-0.parquet` this way at all: it would have been fine to call these files `subset-a.parquet`, `subset-b.parquet`, and `subset-c.parquet` if we had wished. 
We could have written other file formats if we wanted, and we don't necessarily have to use Hive-style folders. You can learn more about the supported formats by reading the help documentation for `open_dataset()`, and learn about how to exercise fine-grained control with `help("Dataset", package = "arrow")`. In any case, we have created an on-disk parquet Dataset using Hive-style partitioning. Our Dataset is defined by these files: diff --git a/r/vignettes/data_types.Rmd b/r/vignettes/data_types.Rmd index 6cbe7c72e6809..4b5ee01b6ab83 100644 --- a/r/vignettes/data_types.Rmd +++ b/r/vignettes/data_types.Rmd @@ -34,7 +34,7 @@ When the arrow package converts between R data and Arrow data, it will first che knitr::include_graphics("./data_types.png") ``` -In this image, black boxes refer to R data types and light blue boxes refer to Arrow data types. Directional arrows specify conversions (e.g., the bidirectional arrow between the logical R type and the boolean Arrow type means that R logicals convert to Arrow booleans and vice versa). Solid lines indicate that the this conversion rule is always the default; dashed lines mean that it only sometimes applies (the rules and special cases are described below). +In this image, black boxes refer to R data types and light blue boxes refer to Arrow data types. Directional arrows specify conversions (e.g., the bidirectional arrow between the logical R type and the boolean Arrow type means that the logical R converts to an Arrow boolean and vice versa). Solid lines indicate that this conversion rule is always the default; dashed lines mean that it only sometimes applies (the rules and special cases are described below). ## Logical/boolean types diff --git a/r/vignettes/data_wrangling.Rmd b/r/vignettes/data_wrangling.Rmd index e3d5b306f3e71..305a91c156eb1 100644 --- a/r/vignettes/data_wrangling.Rmd +++ b/r/vignettes/data_wrangling.Rmd @@ -165,7 +165,7 @@ sw2 %>% transmute(name, height, mass, res = residuals(lm(mass ~ height))) ``` -Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise opertation like `filter()` is not supported: +Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise operation like `filter()` is not supported: ```{r} sw %>% diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index de33e72407792..8e7cff7410473 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -46,18 +46,18 @@ not possible to link to a system version of libarrow during development). ## Option 1: Using nightly libarrow binaries -On Linux, MacOS, and Windows you can use the same workflow you might use for another +On Linux, macOS, and Windows you can use the same workflow you might use for another package that contains compiled code (e.g., `R CMD INSTALL .` from a terminal, `devtools::load_all()` from an R prompt, or `Install & Restart` from RStudio). If the `arrow/r/libarrow` directory is not populated, the configure script will attempt to download the latest nightly libarrow binary, extract it to the -`arrow/r/libarrow` directory (MacOS, Linux) or `arrow/r/windows` +`arrow/r/libarrow` directory (macOS, Linux) or `arrow/r/windows` directory (Windows), and continue building the R package as usual. 
Most of the time, you won't need to update your version of libarrow because the R package rarely changes with updates to the C++ library; however, if you start to get errors when rebuilding the R package, you may have to remove the -`libarrow` directory (MacOS, Linux) or `windows` directory (Windows) +`libarrow` directory (macOS, Linux) or `windows` directory (Windows) and do a "clean" rebuild. You can do this from a terminal with `R CMD INSTALL . --preclean`, from RStudio using the "Clean and Install" option from "Build" tab, or using `make clean` if you are using the `Makefile` diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index a21a7864f7d73..50278af25bd1b 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -14,7 +14,7 @@ This article provides an overview of working with both S3 and GCS data using the ## S3 and GCS support on Linux -Before you start, make sure that your arrow install has support for S3 and/or GCS enabled. For most users this will be true by default, because the Windows and MacOS binary packages hosted on CRAN include S3 and GCS support. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow install has support for S3 and/or GCS enabled. For most users this will be true by default, because the Windows and macOS binary packages hosted on CRAN include S3 and GCS support. You can check whether support is enabled via helper functions: ```r arrow_with_s3() @@ -307,7 +307,7 @@ Sys.unsetenv("AWS_S3_ENDPOINT") ``` By default, the AWS SDK tries to retrieve metadata about user configuration, -which can cause conficts when passing in connection details via URI (for example +which can cause conflicts when passing in connection details via URI (for example when accessing a MINIO bucket). To disable the use of AWS environment variables, you can set environment variable `AWS_EC2_METADATA_DISABLED` to `TRUE`. diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 10155e3a8cd5b..df43a9de36fc2 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -10,9 +10,9 @@ In most cases, `install.packages("arrow")` should just work. There are things yo ## Background -The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or MacOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. +The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or macOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. -This article outlines the recommend approaches to installing arrow on Linux, starting from the simplest and least customizable to the most complex but with more flexbility to customize your installation. 
+This article outlines the recommend approaches to installing arrow on Linux, starting from the simplest and least customizable to the most complex but with more flexibility to customize your installation. The primary audience for this document is arrow R package _users_ on Linux, and not Arrow _developers_. Additional resources for developers are listed at the end of this article. @@ -225,7 +225,7 @@ already present (when set to `AUTO`, the default). These dependencies vary by platform; however, if you wish to install these yourself prior to libarrow installation, we recommend that you take a look at the [docker file for whichever of our CI builds](https://github.com/apache/arrow/tree/main/ci/docker) -(the ones ending in "cpp" are for building Arrow's C++ libaries, aka libarrow) +(the ones ending in "cpp" are for building Arrow's C++ libraries, aka libarrow) corresponds most closely to your setup. This will contain the most up-to-date information about dependencies and minimum versions. diff --git a/r/vignettes/read_write.Rmd b/r/vignettes/read_write.Rmd index 15b2392b8ee5c..0ee695a6f4907 100644 --- a/r/vignettes/read_write.Rmd +++ b/r/vignettes/read_write.Rmd @@ -140,7 +140,7 @@ write_csv_arrow(mtcars, file_path) read_csv_arrow(file_path, col_select = starts_with("d")) ``` -In addition to the options provided by the readr-style arguments (`delim`, `quote`, `escape_doubple`, `escape_backslash`, etc), you can use the `schema` argument to specify column types: see `schema()` help for details. There is also the option of using `parse_options`, `convert_options`, and `read_options` to exercise fine-grained control over the arrow csv reader: see `help("CsvReadOptions", package = "arrow")` for details. +In addition to the options provided by the readr-style arguments (`delim`, `quote`, `escape_double`, `escape_backslash`, etc), you can use the `schema` argument to specify column types: see `schema()` help for details. There is also the option of using `parse_options`, `convert_options`, and `read_options` to exercise fine-grained control over the arrow csv reader: see `help("CsvReadOptions", package = "arrow")` for details. ## JSON format From 081b4022fe6f659d8765efc82b3f4787c5039e3c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 13:10:52 -0500 Subject: [PATCH 004/570] MINOR: [Java] Bump ch.qos.logback:logback-classic from 1.2.3 to 1.2.13 in /java (#39085) Bumps [ch.qos.logback:logback-classic](https://github.com/qos-ch/logback) from 1.2.3 to 1.2.13.
Commits
  • 2648b9e prepare release 1.2.13
  • bb09515 fix CVE-2023-6378
  • 4573294 start work on 1.2.13-SNAPSHOT
  • a388193 Merge branch 'branch_1.2.x' of github.com:qos-ch/logback into branch_1.2.x
  • de44dc4 prepare release 1.2.12
  • ca0cf17 Merge pull request #532 from joakime/fix-jetty-requestlog
  • e31609b removed unused files
  • 21e29ef Merge pull request #567 from spliffone/LOGBACK-1633
  • e869000 fix: published POM file contain the wrong scm URL
  • 009ea46 version for next dev cycle

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- java/tools/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 4d561dba87a2d..cd16b862d10c0 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -698,7 +698,7 @@ ch.qos.logback logback-classic - 1.2.3 + 1.2.13 test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 4b43c513efd36..1815c39227de9 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -47,7 +47,7 @@ ch.qos.logback logback-classic - 1.2.3 + 1.2.13 runtime From 476c78fd6e535faacfc6a171529ef496abb30cd9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 8 Dec 2023 15:48:06 -0300 Subject: [PATCH 005/570] GH-38597: [C++] Implement GetFileInfo(selector) for Azure filesystem (#39009) ### Rationale for this change Part of Azure FS implementation. ### What changes are included in this PR? The version of `GetFileInfo` that takes a prefix and can optionally recurse into directories. ### Are these changes tested? By unit tests present in this PR. Separate from this PR, I'm thinking of way to fuzz-test the FS API. * Closes: #38597 --- cpp/src/arrow/filesystem/azurefs.cc | 212 ++++++++++++++++++- cpp/src/arrow/filesystem/azurefs.h | 2 +- cpp/src/arrow/filesystem/azurefs_test.cc | 248 ++++++++++++++++++++++- cpp/src/arrow/filesystem/filesystem.cc | 3 +- cpp/src/arrow/filesystem/path_util.cc | 31 ++- cpp/src/arrow/filesystem/path_util.h | 12 +- cpp/src/arrow/filesystem/test_util.cc | 6 + cpp/src/arrow/filesystem/test_util.h | 4 + 8 files changed, 481 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9bd2b0ae9d8a0..daababb04c172 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -39,7 +39,7 @@ namespace fs { // ----------------------------------------------------------------------- // AzureOptions Implementation -AzureOptions::AzureOptions() {} +AzureOptions::AzureOptions() = default; bool AzureOptions::Equals(const AzureOptions& other) const { return (account_dfs_url == other.account_dfs_url && @@ -820,6 +820,209 @@ class AzureFileSystem::Impl { } } + private: + template + Status VisitContainers(const Azure::Core::Context& context, + OnContainer&& on_container) const { + Azure::Storage::Blobs::ListBlobContainersOptions options; + try { + auto container_list_response = + blob_service_client_->ListBlobContainers(options, context); + for (; container_list_response.HasPage(); + container_list_response.MoveToNextPage(context)) { + for (const auto& container : container_list_response.BlobContainers) { + RETURN_NOT_OK(on_container(container)); + } + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus("Failed to list account containers.", exception); + } + return Status::OK(); + } + + static FileInfo FileInfoFromBlob(const std::string& container, + const Azure::Storage::Blobs::Models::BlobItem& blob) { + auto path = internal::ConcatAbstractPath(container, blob.Name); + if (internal::HasTrailingSlash(blob.Name)) { + return DirectoryFileInfoFromPath(path); + } + FileInfo info{std::move(path), FileType::File}; + info.set_size(blob.BlobSize); + info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); + return info; + } + + static FileInfo DirectoryFileInfoFromPath(const std::string& path) { + return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, 
+ FileType::Directory}; + } + + static std::string_view BasenameView(std::string_view s) { + DCHECK(!internal::HasTrailingSlash(s)); + auto offset = s.find_last_of(internal::kSep); + auto result = (offset == std::string_view::npos) ? s : s.substr(offset); + DCHECK(!result.empty() && result.back() != internal::kSep); + return result; + } + + /// \brief List the blobs at the root of a container or some dir in a container. + /// + /// \pre container_client is the client for the container named like the first + /// segment of select.base_dir. + Status GetFileInfoWithSelectorFromContainer( + const Azure::Storage::Blobs::BlobContainerClient& container_client, + const Azure::Core::Context& context, Azure::Nullable page_size_hint, + const FileSelector& select, FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + bool found = false; + Azure::Storage::Blobs::ListBlobsOptions options; + if (internal::IsEmptyPath(base_location.path)) { + // If the base_dir is the root of the container, then we want to list all blobs in + // the container and the Prefix should be empty and not even include the trailing + // slash because the container itself represents the `/` directory. + options.Prefix = {}; + found = true; // Unless the container itself is not found later! + } else { + options.Prefix = internal::EnsureTrailingSlash(base_location.path); + } + options.PageSizeHint = page_size_hint; + options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; + + auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = internal::ConcatAbstractPath( + base_location.container, internal::RemoveTrailingSlash(blob_prefix)); + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + return GetFileInfoWithSelectorFromContainer( + container_client, context, page_size_hint, sub_select, acc_results); + } + return Status::OK(); + }; + + auto process_blob = + [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { + // blob.Name has trailing slash only when Prefix is an empty + // directory marker blob for the directory we're listing + // from, and we should skip it. + if (!internal::HasTrailingSlash(blob.Name)) { + acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); + } + }; + auto process_prefix = [&](const std::string& prefix) noexcept -> Status { + const auto path = internal::ConcatAbstractPath(base_location.container, prefix); + acc_results->push_back(DirectoryFileInfoFromPath(path)); + return recurse(prefix); + }; + + try { + auto list_response = + container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); + for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { + if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) { + continue; + } + found = true; + // Blob and BlobPrefixes are sorted by name, so we can merge-iterate + // them to ensure returned results are all sorted. 
+ size_t blob_index = 0; + size_t blob_prefix_index = 0; + while (blob_index < list_response.Blobs.size() && + blob_prefix_index < list_response.BlobPrefixes.size()) { + const auto& blob = list_response.Blobs[blob_index]; + const auto& prefix = list_response.BlobPrefixes[blob_prefix_index]; + const int cmp = blob.Name.compare(prefix); + if (cmp < 0) { + process_blob(blob); + blob_index += 1; + } else if (cmp > 0) { + RETURN_NOT_OK(process_prefix(prefix)); + blob_prefix_index += 1; + } else { + DCHECK_EQ(blob.Name, prefix); + RETURN_NOT_OK(process_prefix(prefix)); + blob_index += 1; + blob_prefix_index += 1; + // If the container has an empty dir marker blob and another blob starting + // with this blob name as a prefix, the blob doesn't appear in the listing + // that also contains the prefix, so AFAICT this branch in unreachable. The + // code above is kept just in case, but if this DCHECK(false) is ever reached, + // we should refactor this loop to ensure no duplicate entries are ever + // reported. + DCHECK(false) + << "Unexpected blob/prefix name collision on the same listing request"; + } + } + for (; blob_index < list_response.Blobs.size(); blob_index++) { + process_blob(list_response.Blobs[blob_index]); + } + for (; blob_prefix_index < list_response.BlobPrefixes.size(); + blob_prefix_index++) { + RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); + } + } + } catch (const Azure::Storage::StorageException& exception) { + if (exception.ErrorCode == "ContainerNotFound") { + found = false; + } else { + return internal::ExceptionToStatus( + "Failed to list blobs in a directory: " + select.base_dir + ": " + + container_client.GetUrl(), + exception); + } + } + + return found || select.allow_not_found + ? Status::OK() + : ::arrow::fs::internal::PathNotFound(select.base_dir); + } + + public: + Status GetFileInfoWithSelector(const Azure::Core::Context& context, + Azure::Nullable page_size_hint, + const FileSelector& select, + FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + if (base_location.container.empty()) { + // Without a container, the base_location is equivalent to the filesystem + // root -- `/`. FileSelector::allow_not_found doesn't matter in this case + // because the root always exists. + auto on_container = + [&](const Azure::Storage::Blobs::Models::BlobContainerItem& container) { + // Deleted containers are not listed by ListContainers. + DCHECK(!container.IsDeleted); + + // Every container is considered a directory. + FileInfo info{container.Name, FileType::Directory}; + info.set_mtime( + std::chrono::system_clock::time_point{container.Details.LastModified}); + acc_results->push_back(std::move(info)); + + // Recurse into containers (subdirectories) if requested. 
+ if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = container.Name; + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context, page_size_hint, + sub_select, acc_results)); + } + return Status::OK(); + }; + return VisitContainers(context, std::move(on_container)); + } + + auto container_client = + blob_service_client_->GetBlobContainerClient(base_location.container); + return GetFileInfoWithSelectorFromContainer(container_client, context, page_size_hint, + select, acc_results); + } + Result> OpenInputFile(const AzureLocation& location, AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); @@ -1196,7 +1399,12 @@ Result AzureFileSystem::GetFileInfo(const std::string& path) { } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + Azure::Core::Context context; + Azure::Nullable page_size_hint; // unspecified + FileInfoVector results; + RETURN_NOT_OK( + impl_->GetFileInfoWithSelector(context, page_size_hint, select, &results)); + return {std::move(results)}; } Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 9f980ee8baae0..b2865b059ef6e 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -157,7 +157,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { const AzureOptions& options, const io::IOContext& = io::default_io_context()); private: - explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); + AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); class Impl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 41f1663114f45..792c63b209402 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -70,6 +70,9 @@ using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; +namespace Blobs = Azure::Storage::Blobs; +namespace Files = Azure::Storage::Files; + auto const* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis @@ -193,9 +196,8 @@ TEST(AzureFileSystem, OptionsCompare) { class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::unique_ptr blob_service_client_; - std::unique_ptr - datalake_service_client_; + std::unique_ptr blob_service_client_; + std::unique_ptr datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -213,15 +215,14 @@ class AzureFileSystemTest : public ::testing::Test { suite_skipped_ = true; GTEST_SKIP() << options.status().message(); } - container_name_ = RandomChars(32); - blob_service_client_ = std::make_unique( + // Stop-gap solution before GH-39119 is fixed. 
+ container_name_ = "z" + RandomChars(31); + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); - datalake_service_client_ = - std::make_unique( - options_.account_dfs_url, options_.storage_credentials_provider); + datalake_service_client_ = std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); - auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); - container_client.CreateIfNotExists(); + auto container_client = CreateContainer(container_name_); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), @@ -239,6 +240,20 @@ class AzureFileSystemTest : public ::testing::Test { } } + Blobs::BlobContainerClient CreateContainer(const std::string& name) { + auto container_client = blob_service_client_->GetBlobContainerClient(name); + (void)container_client.CreateIfNotExists(); + return container_client; + } + + Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client, + const std::string& name, const std::string& data = "") { + auto blob_client = container_client.GetBlockBlobClient(name); + (void)blob_client.UploadFrom(reinterpret_cast(data.data()), + data.size()); + return blob_client; + } + std::string PreexistingContainerName() const { return container_name_; } std::string PreexistingContainerPath() const { @@ -326,6 +341,45 @@ class AzureFileSystemTest : public ::testing::Test { top_blob_path, }; } + + char const* kSubData = "sub data"; + char const* kSomeData = "some data"; + char const* kOtherData = "other data"; + + void SetUpSmallFileSystemTree() { + // Set up test containers + CreateContainer("empty-container"); + auto container = CreateContainer("container"); + + CreateBlob(container, "emptydir/"); + CreateBlob(container, "somedir/subdir/subfile", kSubData); + CreateBlob(container, "somefile", kSomeData); + // Add an explicit marker for a non-empty directory. + CreateBlob(container, "otherdir/1/2/"); + // otherdir/{1/,2/,3/} are implicitly assumed to exist because of + // the otherdir/1/2/3/otherfile blob. 
+ CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData); + } + + void AssertInfoAllContainersRecursive(const std::vector& infos) { + ASSERT_EQ(infos.size(), 14); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", FileType::File, + strlen(kOtherData)); + AssertFileInfo(infos[7], "container/somedir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[9], "container/somedir/subdir/subfile", FileType::File, + strlen(kSubData)); + AssertFileInfo(infos[10], "container/somefile", FileType::File, strlen(kSomeData)); + AssertFileInfo(infos[11], "empty-container", FileType::Directory); + AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory); + AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File); + } }; class AzuriteFileSystemTest : public AzureFileSystemTest { @@ -518,6 +572,180 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } +TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { + SetUpSmallFileSystemTree(); + + FileSelector select; + std::vector infos; + + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 3); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "empty-container", FileType::Directory); + AssertFileInfo(infos[2], container_name_, FileType::Directory); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Nonexistent container + select.base_dir = "nonexistent-container"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/somedir", FileType::Directory); + AssertFileInfo(infos[3], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + select.base_dir = "container/somedir/subdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); + // Nonexistent + select.base_dir = "container/nonexistent"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + 
select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + + // Trailing slashes + select.base_dir = "empty-container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.base_dir = "nonexistent-container/"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.base_dir = "container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { + SetUpSmallFileSystemTree(); + + FileSelector select; + select.recursive = true; + + std::vector infos; + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 14); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertInfoAllContainersRecursive(infos); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 10); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile", FileType::File, 10); + AssertFileInfo(infos[6], "container/somedir", FileType::Directory); + AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File, 8); + AssertFileInfo(infos[9], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 2); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); + + select.base_dir = "container/otherdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) { + { + auto container = CreateContainer("container"); + CreateBlob(container, "mydir/emptydir1/"); + CreateBlob(container, "mydir/emptydir2/"); + CreateBlob(container, "mydir/nonemptydir1/"); // explicit dir marker + CreateBlob(container, "mydir/nonemptydir1/somefile", kSomeData); + CreateBlob(container, "mydir/nonemptydir2/somefile", kSomeData); + } + std::vector infos; + + FileSelector select; // non-recursive + select.base_dir = 
"container"; + + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container/mydir", FileType::Directory); + + select.base_dir = "container/mydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 4); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory); + AssertFileInfo(infos[1], "container/mydir/emptydir2", FileType::Directory); + AssertFileInfo(infos[2], "container/mydir/nonemptydir1", FileType::Directory); + AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory); + + select.base_dir = "container/mydir/emptydir1"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + select.base_dir = "container/mydir/emptydir2"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + select.base_dir = "container/mydir/nonemptydir1"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File); + + select.base_dir = "container/mydir/nonemptydir2"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); +} + TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) { ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); } diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 9ecc4610f3864..810e9c179b156 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -654,8 +654,7 @@ Status CopyFiles(const std::shared_ptr& source_fs, "', which is outside base dir '", source_sel.base_dir, "'"); } - auto destination_path = - internal::ConcatAbstractPath(destination_base_dir, std::string(*relative)); + auto destination_path = internal::ConcatAbstractPath(destination_base_dir, *relative); if (source_info.IsDirectory()) { dirs.push_back(destination_path); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 46ea436a9f31a..9c895ae76c7b8 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -52,7 +52,7 @@ std::vector SplitAbstractPath(const std::string& path, char sep) { } auto append_part = [&parts, &v](size_t start, size_t end) { - parts.push_back(std::string(v.substr(start, end - start))); + parts.emplace_back(v.substr(start, end - start)); }; size_t start = 0; @@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int offset, int length, char return ""; } std::vector components = SplitAbstractPath(s, sep); - std::stringstream combined; if (offset >= static_cast(components.size())) { return ""; } - int end = offset + length; - if (end > static_cast(components.size())) { - end = static_cast(components.size()); - } - for (int i = offset; i < end; i++) { + const auto end = std::min(static_cast(offset) + length, components.size()); + std::stringstream combined; + for (auto i = static_cast(offset); i < end; i++) { combined << components[i]; if (i < end - 1) { combined << sep; @@ -140,16 +137,20 @@ Status ValidateAbstractPathParts(const std::vector& parts) { return Status::OK(); } -std::string ConcatAbstractPath(const std::string& base, const std::string& stem) { +std::string ConcatAbstractPath(std::string_view base, 
std::string_view stem) { DCHECK(!stem.empty()); if (base.empty()) { - return stem; + return std::string{stem}; } - return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem)); + std::string result; + result.reserve(base.length() + stem.length() + 1); // extra 1 is for potential kSep + result += EnsureTrailingSlash(base); + result += RemoveLeadingSlash(stem); + return result; } std::string EnsureTrailingSlash(std::string_view v) { - if (v.length() > 0 && v.back() != kSep) { + if (!v.empty() && !HasTrailingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "C:/"... // Unless the local filesystem always uses absolute paths return std::string(v) + kSep; @@ -159,7 +160,7 @@ std::string EnsureTrailingSlash(std::string_view v) { } std::string EnsureLeadingSlash(std::string_view v) { - if (v.length() == 0 || v.front() != kSep) { + if (!HasLeadingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "/C:"... return kSep + std::string(v); } else { @@ -197,10 +198,6 @@ Status AssertNoTrailingSlash(std::string_view key) { return Status::OK(); } -bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } - -bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } - Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { if (base.empty() || base.front() != kSep) { @@ -383,7 +380,7 @@ struct Globber::Impl { Globber::Globber(std::string pattern) : impl_(new Impl(pattern)) {} -Globber::~Globber() {} +Globber::~Globber() = default; bool Globber::Matches(const std::string& path) { return regex_match(path, impl_->pattern_); diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 2c8c123e779f4..1da7afd3f9381 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -69,7 +69,7 @@ Status ValidateAbstractPathParts(const std::vector& parts); // Append a non-empty stem to an abstract path. ARROW_EXPORT -std::string ConcatAbstractPath(const std::string& base, const std::string& stem); +std::string ConcatAbstractPath(std::string_view base, std::string_view stem); // Make path relative to base, if it starts with base. Otherwise error out. 
ARROW_EXPORT @@ -94,11 +94,13 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); -ARROW_EXPORT -bool HasTrailingSlash(std::string_view s); +inline bool HasTrailingSlash(std::string_view s) { + return !s.empty() && s.back() == kSep; +} -ARROW_EXPORT -bool HasLeadingSlash(std::string_view s); +inline bool HasLeadingSlash(std::string_view s) { + return !s.empty() && s.front() == kSep; +} ARROW_EXPORT bool IsAncestorOf(std::string_view ancestor, std::string_view descendant); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 6c5dda8e659df..040917dcd218a 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -126,6 +126,12 @@ void SortInfos(std::vector* infos) { std::sort(infos->begin(), infos->end(), FileInfo::ByPath{}); } +std::vector SortedInfos(const std::vector& infos) { + auto sorted = infos; + SortInfos(&sorted); + return sorted; +} + void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos) { auto fut = CollectAsyncGenerator(gen); ASSERT_FINISHES_OK_AND_ASSIGN(auto nested_infos, fut); diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index c4d846fd31b34..62b488e159a24 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -74,6 +74,10 @@ void CreateFile(FileSystem* fs, const std::string& path, const std::string& data ARROW_TESTING_EXPORT void SortInfos(FileInfoVector* infos); +// Create a copy of a FileInfo vector sorted by lexicographic path order +ARROW_TESTING_EXPORT +FileInfoVector SortedInfos(const FileInfoVector& infos); + ARROW_TESTING_EXPORT void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos); From b75755a2c06419abda8859e56f3bcc64f148d681 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 8 Dec 2023 20:04:03 +0100 Subject: [PATCH 006/570] GH-38479: [C++] Avoid passing null pointer to LZ4 frame decompressor (#39125) ### Rationale for this change Avoid undefined behavior in LZ4 when adding an offset to a null pointer. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #38479 --- cpp/src/arrow/io/compressed.cc | 4 +++- cpp/src/arrow/util/compression_lz4.cc | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index 72977f0f297f5..6c484242a4fc8 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -279,6 +279,8 @@ class CompressedInputStream::Impl { // Decompress some data from the compressed_ buffer. // Call this function only if the decompressed_ buffer is empty. Status DecompressData() { + DCHECK_NE(compressed_->data(), nullptr); + int64_t decompress_size = kDecompressSize; while (true) { @@ -329,7 +331,7 @@ class CompressedInputStream::Impl { // Try to feed more data into the decompressed_ buffer. Status RefillDecompressed(bool* has_data) { // First try to read data from the decompressor - if (compressed_) { + if (compressed_ && compressed_->size() != 0) { if (decompressor_->IsFinished()) { // We just went over the end of a previous compressed stream. 
RETURN_NOT_OK(decompressor_->Reset()); diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 17e013c13ee0b..be957afab3c46 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -109,6 +109,7 @@ class LZ4Decompressor : public Decompressor { auto dst_capacity = static_cast(output_len); size_t ret; + DCHECK_NE(src, nullptr); ret = LZ4F_decompress(ctx_, dst, &dst_capacity, src, &src_size, nullptr /* options */); if (LZ4F_isError(ret)) { From 140ae018f372ee14c9ff19f3e4c2af1b1a579f49 Mon Sep 17 00:00:00 2001 From: Tim Schaub Date: Fri, 8 Dec 2023 20:06:32 +0100 Subject: [PATCH 007/570] GH-38506: [Go][Parquet] Add NumRows and RowGroupNumRows to pqarrow.FileWriter (#38507) ### Rationale for this change When using a chunked column reader to read from one Parquet file and a chunked column writer to write to another Parquet file, it can be useful to keep track of the number of rows written. ### What changes are included in this PR? This branch adds a new `RowGroupNumRows` method to the `pqarrow.FileWriter`. This is somewhat similar to the existing `RowGroupTotalBytesWritten` function. ### Are these changes tested? A new `file_writer_test.go` file is added that adds a test for the new method. ### Are there any user-facing changes? The new method is exported and documented. * Closes: #38506 Authored-by: Tim Schaub Signed-off-by: Matt Topol --- go/parquet/pqarrow/file_writer.go | 17 +++++ go/parquet/pqarrow/file_writer_test.go | 89 ++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 go/parquet/pqarrow/file_writer_test.go diff --git a/go/parquet/pqarrow/file_writer.go b/go/parquet/pqarrow/file_writer.go index bc484ba243f87..1164cd690c399 100644 --- a/go/parquet/pqarrow/file_writer.go +++ b/go/parquet/pqarrow/file_writer.go @@ -134,6 +134,23 @@ func (fw *FileWriter) RowGroupTotalBytesWritten() int64 { return 0 } +// RowGroupNumRows returns the number of rows written to the current row group. +// Returns an error if they are unequal between columns that have been written so far. +func (fw *FileWriter) RowGroupNumRows() (int, error) { + if fw.rgw != nil { + return fw.rgw.NumRows() + } + return 0, nil +} + +// NumRows returns the total number of rows that have been written so far. +func (fw *FileWriter) NumRows() int { + if fw.wr != nil { + return fw.wr.NumRows() + } + return 0 +} + // WriteBuffered will either append to an existing row group or create a new one // based on the record length and max row group length. // diff --git a/go/parquet/pqarrow/file_writer_test.go b/go/parquet/pqarrow/file_writer_test.go new file mode 100644 index 0000000000000..0b76733a62876 --- /dev/null +++ b/go/parquet/pqarrow/file_writer_test.go @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package pqarrow_test + +import ( + "bytes" + "strings" + "testing" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/array" + "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/apache/arrow/go/v15/parquet" + "github.com/apache/arrow/go/v15/parquet/pqarrow" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFileWriterRowGroupNumRows(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "one", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + {Name: "two", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + }, nil) + + data := `[ + {"one": 1, "two": 2}, + {"one": 1, "two": null}, + {"one": null, "two": 2}, + {"one": null, "two": null} + ]` + record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data)) + require.NoError(t, err) + + output := &bytes.Buffer{} + writerProps := parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(100)) + writer, err := pqarrow.NewFileWriter(schema, output, writerProps, pqarrow.DefaultWriterProps()) + require.NoError(t, err) + + require.NoError(t, writer.Write(record)) + numRows, err := writer.RowGroupNumRows() + require.NoError(t, err) + assert.Equal(t, 4, numRows) + require.NoError(t, writer.Close()) +} + +func TestFileWriterNumRows(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "one", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + {Name: "two", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + }, nil) + + data := `[ + {"one": 1, "two": 2}, + {"one": 1, "two": null}, + {"one": null, "two": 2}, + {"one": null, "two": null} + ]` + record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data)) + require.NoError(t, err) + + maxRowGroupLength := 2 + + output := &bytes.Buffer{} + writerProps := parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(int64(maxRowGroupLength))) + writer, err := pqarrow.NewFileWriter(schema, output, writerProps, pqarrow.DefaultWriterProps()) + require.NoError(t, err) + + require.NoError(t, writer.Write(record)) + rowGroupNumRows, err := writer.RowGroupNumRows() + require.NoError(t, err) + assert.Equal(t, maxRowGroupLength, rowGroupNumRows) + + require.NoError(t, writer.Close()) + assert.Equal(t, 4, writer.NumRows()) +} From 4b1f06327f05341b6e51293b3186d80cd5fdbf87 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:41:33 -0500 Subject: [PATCH 008/570] MINOR: [Java] Bump ch.qos.logback:logback-classic from 1.2.13 to 1.3.14 in /java (#39145) ### Rationale for this change Raised by dependabot, but dependabot didn't upgrade to the correct version for Arrow Java and did not upgrade the dependency. ### What changes are included in this PR? * logback 1.2.13 -> 1.3.14 * slf4j 1.7.25 -> 2.0.7 (required by logback 1.3.14) ### Are these changes tested? CI ### Are there any user-facing changes? 
No Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- java/memory/memory-netty/pom.xml | 2 +- java/pom.xml | 6 +++--- java/tools/pom.xml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/memory/memory-netty/pom.xml b/java/memory/memory-netty/pom.xml index 88a0436eb7175..e625cbeabc65a 100644 --- a/java/memory/memory-netty/pom.xml +++ b/java/memory/memory-netty/pom.xml @@ -41,7 +41,7 @@ ch.qos.logback logback-core - 1.2.13 + 1.3.14 test diff --git a/java/pom.xml b/java/pom.xml index cd16b862d10c0..cd26e79d47f3d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -31,7 +31,7 @@ ${project.build.directory}/generated-sources 1.9.0 5.10.1 - 1.7.25 + 2.0.7 32.1.3-jre 4.1.100.Final 1.59.0 @@ -308,7 +308,7 @@ org.slf4j jcl-over-slf4j - 1.7.5 + ${dep.slf4j.version} @@ -698,7 +698,7 @@ ch.qos.logback logback-classic - 1.2.13 + 1.3.14 test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 1815c39227de9..8ea98a84b4ad1 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -47,7 +47,7 @@ ch.qos.logback logback-classic - 1.2.13 + 1.3.14 runtime From ad63158e74d903c263c51cd0207cf77f8aa12ede Mon Sep 17 00:00:00 2001 From: abandy Date: Fri, 8 Dec 2023 14:50:22 -0700 Subject: [PATCH 009/570] GH-37884: [Swift] allow reading of unaligned FlatBuffers buffers (#38635) The PR enables the swift readers to read from unaligned buffers (fix for issue: 37884) Enabling unaligned buffers incurs a performance penalty so the developer will need to consider this when enabling this feature. It is not currently possible to recover from a buffer unaligned error as this error is a fatalError so trying aligned and then falling back to unaligned is not an option. Also, FlatBuffers has a verifier that should be able to catch this error but currently it seems to fail on both aligned and unaligned buffers (I tried verifying the example python server get return value but verification fails even though the buffers are able to be read successfully) * Closes: #37884 Authored-by: Alva Bandy Signed-off-by: Sutou Kouhei --- swift/Arrow/Package.swift | 6 +++++- swift/Arrow/Sources/Arrow/ArrowReader.swift | 18 +++++++++++++----- .../Sources/ArrowFlight/FlightClient.swift | 11 +++++++++-- .../Sources/ArrowFlight/FlightServer.swift | 7 +++++++ .../ArrowFlight/RecordBatchStreamReader.swift | 11 +++++++++-- 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/swift/Arrow/Package.swift b/swift/Arrow/Package.swift index 065afe62640ea..946eb999c798a 100644 --- a/swift/Arrow/Package.swift +++ b/swift/Arrow/Package.swift @@ -32,7 +32,11 @@ let package = Package( targets: ["Arrow"]), ], dependencies: [ - .package(url: "https://github.com/google/flatbuffers.git", from: "23.3.3") + // The latest version of flatbuffers v23.5.26 was built in May 26, 2023 + // and therefore doesn't include the unaligned buffer swift changes. + // This can be changed back to using the tag once a new version of + // flatbuffers has been released. + .package(url: "https://github.com/google/flatbuffers.git", branch: "master") ], targets: [ // Targets are the basic building blocks of a package. A target can define a module or a test suite. 
diff --git a/swift/Arrow/Sources/Arrow/ArrowReader.swift b/swift/Arrow/Sources/Arrow/ArrowReader.swift index ef995b18052a8..d9dc1bdb470e6 100644 --- a/swift/Arrow/Sources/Arrow/ArrowReader.swift +++ b/swift/Arrow/Sources/Arrow/ArrowReader.swift @@ -132,7 +132,8 @@ public class ArrowReader { } public func fromStream( // swiftlint:disable:this function_body_length - _ fileData: Data + _ fileData: Data, + useUnalignedBuffers: Bool = false ) -> Result { let footerLength = fileData.withUnsafeBytes { rawBuffer in rawBuffer.loadUnaligned(fromByteOffset: fileData.count - 4, as: Int32.self) @@ -141,7 +142,9 @@ public class ArrowReader { let result = ArrowReaderResult() let footerStartOffset = fileData.count - Int(footerLength + 4) let footerData = fileData[footerStartOffset...] - let footerBuffer = ByteBuffer(data: footerData) + let footerBuffer = ByteBuffer( + data: footerData, + allowReadingUnalignedBuffers: useUnalignedBuffers) let footer = org_apache_arrow_flatbuf_Footer.getRootAsFooter(bb: footerBuffer) let schemaResult = loadSchema(footer.schema!) switch schemaResult { @@ -170,7 +173,9 @@ public class ArrowReader { let messageStartOffset = recordBatch.offset + (Int64(MemoryLayout.size) * messageOffset) let messageEndOffset = messageStartOffset + Int64(messageLength) let recordBatchData = fileData[messageStartOffset ..< messageEndOffset] - let mbb = ByteBuffer(data: recordBatchData) + let mbb = ByteBuffer( + data: recordBatchData, + allowReadingUnalignedBuffers: useUnalignedBuffers) let message = org_apache_arrow_flatbuf_Message.getRootAsMessage(bb: mbb) switch message.headerType { case .recordbatch: @@ -219,9 +224,12 @@ public class ArrowReader { public func fromMessage( _ dataHeader: Data, dataBody: Data, - result: ArrowReaderResult + result: ArrowReaderResult, + useUnalignedBuffers: Bool = false ) -> Result { - let mbb = ByteBuffer(data: dataHeader) + let mbb = ByteBuffer( + data: dataHeader, + allowReadingUnalignedBuffers: useUnalignedBuffers) let message = org_apache_arrow_flatbuf_Message.getRootAsMessage(bb: mbb) switch message.headerType { case .schema: diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift index 7a572ceca5bd6..ef3e4fa239e84 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift @@ -24,8 +24,11 @@ import Arrow public class FlightClient { let client: Arrow_Flight_Protocol_FlightServiceAsyncClient - public init(channel: GRPCChannel) { + let allowReadingUnalignedBuffers: Bool + + public init(channel: GRPCChannel, allowReadingUnalignedBuffers: Bool = false ) { client = Arrow_Flight_Protocol_FlightServiceAsyncClient(channel: channel) + self.allowReadingUnalignedBuffers = allowReadingUnalignedBuffers } private func readMessages( @@ -34,7 +37,11 @@ public class FlightClient { let reader = ArrowReader() let arrowResult = ArrowReader.makeArrowReaderResult() for try await data in responseStream { - switch reader.fromMessage(data.dataHeader, dataBody: data.dataBody, result: arrowResult) { + switch reader.fromMessage( + data.dataHeader, + dataBody: data.dataBody, + result: arrowResult, + useUnalignedBuffers: allowReadingUnalignedBuffers) { case .success: continue case .failure(let error): diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift index a34bf5c0acee9..19644d632e997 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift +++ 
b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift @@ -63,6 +63,7 @@ public func schemaFromMessage(_ schemaData: Data) -> ArrowSchema? { } public protocol ArrowFlightServer: Sendable { + var allowReadingUnalignedBuffers: Bool { get } func listFlights(_ criteria: FlightCriteria, writer: FlightInfoStreamWriter) async throws func getFlightInfo(_ request: FlightDescriptor) async throws -> FlightInfo func getSchema(_ request: FlightDescriptor) async throws -> ArrowFlight.FlightSchemaResult @@ -73,6 +74,12 @@ public protocol ArrowFlightServer: Sendable { func doExchange(_ reader: RecordBatchStreamReader, writer: RecordBatchStreamWriter) async throws } +extension ArrowFlightServer { + var allowReadingUnalignedBuffers: Bool { + return false + } +} + public func makeFlightServer(_ handler: ArrowFlightServer) -> CallHandlerProvider { return InternalFlightServer(handler) } diff --git a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift index 972d19435ddfc..464752dbcbeea 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift +++ b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift @@ -27,10 +27,13 @@ public class RecordBatchStreamReader: AsyncSequence, AsyncIteratorProtocol { var descriptor: FlightDescriptor? var batchIndex = 0 var streamIterator: any AsyncIteratorProtocol + var useUnalignedBuffers: Bool let stream: GRPC.GRPCAsyncRequestStream - init(_ stream: GRPC.GRPCAsyncRequestStream) { + init(_ stream: GRPC.GRPCAsyncRequestStream, + useUnalignedBuffers: Bool = false) { self.stream = stream self.streamIterator = self.stream.makeAsyncIterator() + self.useUnalignedBuffers = useUnalignedBuffers } public func next() async throws -> (Arrow.RecordBatch?, FlightDescriptor?)? { @@ -55,7 +58,11 @@ public class RecordBatchStreamReader: AsyncSequence, AsyncIteratorProtocol { let dataBody = flightData.dataBody let dataHeader = flightData.dataHeader descriptor = FlightDescriptor(flightData.flightDescriptor) - switch reader.fromMessage(dataHeader, dataBody: dataBody, result: result) { + switch reader.fromMessage( + dataHeader, + dataBody: dataBody, + result: result, + useUnalignedBuffers: useUnalignedBuffers) { case .success(()): if result.batches.count > 0 { batches = result.batches From 8a644afc77ebe6333114e503cab29f9b0969618a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 9 Dec 2023 07:01:31 +0900 Subject: [PATCH 010/570] GH-39136: [C++] Remove needless system Protobuf dependency with -DARROW_HDFS=ON (#39137) ### Rationale for this change Our HDFS related codes don't depend on Protobuf because we process HDFS via external `libhdfs.so` and it's `dlopen()`-ed. ### What changes are included in this PR? Remove a needless CMake configuration. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
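As a minimal illustration of the `dlopen()` point above, the pattern looks roughly like the sketch below. This is a standalone example, not Arrow's actual HDFS shim; `hdfsConnect` is a symbol name from the libhdfs C API, while the rest of the program is invented for illustration.

```cpp
// Standalone sketch of the dlopen()/dlsym() pattern; not Arrow's actual
// HDFS loader. Build with something like: g++ sketch.cc -ldl
#include <dlfcn.h>

#include <cstdio>

int main() {
  // The library is resolved at runtime, so nothing HDFS-related has to be
  // linked into the program at build time.
  void* handle = dlopen("libhdfs.so", RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "could not load libhdfs.so: %s\n", dlerror());
    return 1;
  }
  // Symbols are looked up by name, e.g. a libhdfs C API entry point.
  void* sym = dlsym(handle, "hdfsConnect");
  std::printf("hdfsConnect %s\n", sym != nullptr ? "resolved" : "not found");
  dlclose(handle);
  return 0;
}
```

Because the library and its symbols are only resolved at runtime, no HDFS (or Protobuf) libraries need to appear on libarrow's link line, which is why the CMake dependency could be dropped.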
* Closes: #39136 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bcb298407bd8b..9f17350b2505a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -873,9 +873,6 @@ add_dependencies(arrow_test_dependencies toolchain-tests) if(ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) if(ARROW_HDFS OR ARROW_ORC) - if(Protobuf_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) - endif() if(NOT MSVC_TOOLCHAIN) list(APPEND ARROW_STATIC_LINK_LIBS ${CMAKE_DL_LIBS}) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) From 31d2afc28a201bda78da8b0229e823413ff82e0d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 9 Dec 2023 16:39:51 +0100 Subject: [PATCH 011/570] GH-39126: [C++][CI] Fix Valgrind failures (#39127) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? * Closes: #39126 Lead-authored-by: Antoine Pitrou Co-authored-by: Antoine Pitrou Co-authored-by: Benjamin Kietzman Signed-off-by: Antoine Pitrou --- cpp/src/arrow/array/array_dict_test.cc | 2 +- cpp/src/arrow/array/array_test.cc | 1 + cpp/src/arrow/array/builder_binary.cc | 9 ++++---- cpp/src/arrow/array/builder_binary.h | 31 +++++++++++++++++--------- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 2f3ee6e2d49a5..a4c03b5db6371 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1129,7 +1129,7 @@ TEST(TestDictionary, Validate) { arr = std::make_shared(dict_type, indices, MakeArray(invalid_data)); ASSERT_RAISES(Invalid, arr->ValidateFull()); -#if !defined(__APPLE__) +#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) // GH-35712: ASSERT_DEATH would make testing slow on MacOS. 
ASSERT_DEATH( { diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 974eb54d2caca..e9d478f108584 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -728,6 +728,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { } for (auto scalar : scalars) { + ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString()); AssertAppendScalar(pool_, scalar); } } diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 3ff22d4a3feeb..f85852fa0eda6 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -80,10 +80,11 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); - BufferVector buffers = {null_bitmap, data}; - for (auto&& buffer : data_heap_builder_.Finish()) { - buffers.push_back(std::move(buffer)); - } + ARROW_ASSIGN_OR_RAISE(auto byte_buffers, data_heap_builder_.Finish()); + BufferVector buffers(byte_buffers.size() + 2); + buffers[0] = std::move(null_bitmap); + buffers[1] = std::move(data); + std::move(byte_buffers.begin(), byte_buffers.end(), buffers.begin() + 2); *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); Reset(); return Status::OK(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 3e87cf2403610..d825f7d32520a 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -524,16 +524,11 @@ class ARROW_EXPORT StringHeapBuilder { "strings larger than 2GB"); } if (num_bytes > current_remaining_bytes_) { - // Ensure the buffer is fully overwritten to avoid leaking uninitialized - // bytes from the allocator - if (current_remaining_bytes_ > 0) { - std::memset(current_out_buffer_, 0, current_remaining_bytes_); - blocks_.back() = SliceBuffer(blocks_.back(), 0, - blocks_.back()->size() - current_remaining_bytes_); - } + ARROW_RETURN_NOT_OK(FinishLastBlock()); current_remaining_bytes_ = num_bytes > blocksize_ ? 
num_bytes : blocksize_; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, - AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr new_block, + AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_)); current_offset_ = 0; current_out_buffer_ = new_block->mutable_data(); blocks_.emplace_back(std::move(new_block)); @@ -550,7 +545,10 @@ class ARROW_EXPORT StringHeapBuilder { int64_t current_remaining_bytes() const { return current_remaining_bytes_; } - std::vector> Finish() { + Result>> Finish() { + if (!blocks_.empty()) { + ARROW_RETURN_NOT_OK(FinishLastBlock()); + } current_offset_ = 0; current_out_buffer_ = NULLPTR; current_remaining_bytes_ = 0; @@ -558,10 +556,21 @@ class ARROW_EXPORT StringHeapBuilder { } private: + Status FinishLastBlock() { + if (current_remaining_bytes_ > 0) { + // Avoid leaking uninitialized bytes from the allocator + ARROW_RETURN_NOT_OK( + blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_, + /*shrink_to_fit=*/true)); + blocks_.back()->ZeroPadding(); + } + return Status::OK(); + } + MemoryPool* pool_; int64_t alignment_; int64_t blocksize_ = kDefaultBlocksize; - std::vector> blocks_; + std::vector> blocks_; int32_t current_offset_ = 0; uint8_t* current_out_buffer_ = NULLPTR; From 20c975d03f8db85a0a3adea2e384b2291fb56da3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 9 Dec 2023 17:22:51 +0100 Subject: [PATCH 012/570] GH-39122: [C++][Parquet] Optimize FLBA record reader (#39124) ### Rationale for this change The FLBA implementation of RecordReader is suboptimal: * it doesn't preallocate the output array * it reads the decoded validity bitmap one bit at a time and recreates it, one bit at a time ### What changes are included in this PR? Optimize the FLBA implementation of RecordReader so as to avoid the aforementioned inefficiencies. I did a quick-and-dirty benchmark on a Parquet file with two columns: * column 1: uncompressed, PLAIN-encoded, FLBA<3> with no nulls * column 2: uncompressed, PLAIN-encoded, FLBA<3> with 25% nulls With git main, the file can be read at 465 MB/s. With this PR, the file can be read at 700 MB/s. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
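For context, a rough sketch of the kind of read the benchmark above performs, going through the public `parquet::arrow` reader. The file name is a placeholder and error handling is reduced to status propagation; fixed-size binary columns such as the FLBA<3> ones described are decoded by the record reader optimized in this change.

```cpp
// Rough sketch: read a Parquet file whose columns are FIXED_LEN_BYTE_ARRAY.
// "data.parquet" is a placeholder path.
#include <memory>

#include <arrow/api.h>
#include <arrow/io/file.h>
#include <parquet/arrow/reader.h>

arrow::Status ReadFixedLenByteArrayFile() {
  ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open("data.parquet"));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(
      parquet::arrow::OpenFile(input, arrow::default_memory_pool(), &reader));
  // ReadTable drives the per-column record readers, including the FLBA one.
  std::shared_ptr<arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
  return arrow::Status::OK();
}
```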
* Closes: #39122 Lead-authored-by: Antoine Pitrou Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_reader.cc | 70 +++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ecc48811e46fc..a49e58afbdb83 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -30,6 +30,7 @@ #include #include "arrow/array.h" +#include "arrow/array/array_binary.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" #include "arrow/array/builder_primitive.h" @@ -2040,23 +2041,29 @@ class TypedRecordReader : public TypedColumnReaderImpl, LevelInfo leaf_info_; }; -class FLBARecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { +class FLBARecordReader final : public TypedRecordReader, + virtual public BinaryRecordReader { public: FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), - builder_(nullptr) { + byte_width_(descr_->type_length()), + empty_(byte_width_, 0), + type_(::arrow::fixed_size_binary(byte_width_)), + null_bitmap_builder_(pool), + data_builder_(pool) { ARROW_DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); - int byte_width = descr_->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_ = std::make_unique<::arrow::FixedSizeBinaryBuilder>(type, this->pool_); } ::arrow::ArrayVector GetBuilderChunks() override { - std::shared_ptr<::arrow::Array> chunk; - PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); - return ::arrow::ArrayVector({chunk}); + const int64_t null_count = null_bitmap_builder_.false_count(); + const int64_t length = null_bitmap_builder_.length(); + ARROW_DCHECK_EQ(length * byte_width_, data_builder_.length()); + PARQUET_ASSIGN_OR_THROW(auto data_buffer, data_builder_.Finish()); + PARQUET_ASSIGN_OR_THROW(auto null_bitmap, null_bitmap_builder_.Finish()); + auto chunk = std::make_shared<::arrow::FixedSizeBinaryArray>( + type_, length, data_buffer, null_bitmap, null_count); + return ::arrow::ArrayVector({std::move(chunk)}); } void ReadValuesDense(int64_t values_to_read) override { @@ -2065,9 +2072,9 @@ class FLBARecordReader : public TypedRecordReader, this->current_decoder_->Decode(values, static_cast(values_to_read)); CheckNumberDecoded(num_decoded, values_to_read); - for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); - } + PARQUET_THROW_NOT_OK(null_bitmap_builder_.Reserve(num_decoded)); + PARQUET_THROW_NOT_OK(data_builder_.Reserve(num_decoded * byte_width_)); + UnsafeAppendDense(values, num_decoded); ResetValues(); } @@ -2081,22 +2088,45 @@ class FLBARecordReader : public TypedRecordReader, valid_bits, valid_bits_offset); ARROW_DCHECK_EQ(num_decoded, values_to_read); + PARQUET_THROW_NOT_OK(null_bitmap_builder_.Reserve(num_decoded)); + PARQUET_THROW_NOT_OK(data_builder_.Reserve(num_decoded * byte_width_)); + if (null_count == 0) { + UnsafeAppendDense(values, num_decoded); + } else { + UnsafeAppendSpaced(values, num_decoded, valid_bits, valid_bits_offset); + } + ResetValues(); + } + + void UnsafeAppendDense(const FLBA* values, int64_t num_decoded) { + null_bitmap_builder_.UnsafeAppend(num_decoded, /*value=*/true); + for (int64_t i = 0; i < num_decoded; i++) { + data_builder_.UnsafeAppend(values[i].ptr, 
byte_width_); + } + } + + void UnsafeAppendSpaced(const FLBA* values, int64_t num_decoded, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + null_bitmap_builder_.UnsafeAppend(valid_bits, valid_bits_offset, num_decoded); for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::bit_util::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); + data_builder_.UnsafeAppend(values[i].ptr, byte_width_); } else { - PARQUET_THROW_NOT_OK(builder_->AppendNull()); + data_builder_.UnsafeAppend(empty_.data(), byte_width_); } } - ResetValues(); } private: - std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; + const int byte_width_; + const std::vector empty_; + std::shared_ptr<::arrow::DataType> type_; + ::arrow::TypedBufferBuilder null_bitmap_builder_; + ::arrow::BufferBuilder data_builder_; }; -class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { +class ByteArrayChunkedRecordReader final : public TypedRecordReader, + virtual public BinaryRecordReader { public: ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) @@ -2137,8 +2167,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, typename EncodingTraits::Accumulator accumulator_; }; -class ByteArrayDictionaryRecordReader : public TypedRecordReader, - virtual public DictionaryRecordReader { +class ByteArrayDictionaryRecordReader final : public TypedRecordReader, + virtual public DictionaryRecordReader { public: ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) From 47f4d18ec5c42a9652d9f9bec18adb9cf5fb0e55 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 9 Dec 2023 21:15:20 -0300 Subject: [PATCH 013/570] GH-38702 [C++]: Implement AzureFileSystem::DeleteRootDirContents (#39151) ### Rationale for this change This copies the behavior implemented by S3FileSystem. ### What changes are included in this PR? An implementation of `DeleteRootDirContent` that prevents deletion of all blob containers. ### Are these changes tested? N/A. * Closes: #38702 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index daababb04c172..824a8fb531483 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1427,7 +1427,7 @@ Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_ } Status AzureFileSystem::DeleteRootDirContents() { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + return Status::NotImplemented("Cannot delete all Azure Blob Storage containers"); } Status AzureFileSystem::DeleteFile(const std::string& path) { From cca5eec5fd853d4593dfe1b6c158e9543d32619f Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sun, 10 Dec 2023 09:57:56 +0800 Subject: [PATCH 014/570] GH-39156: [C++][Compute] Fix negative duration division (#39158) ### Rationale for this change I forgot to cast durations to doubles in the current `division(duration, duration)` kernel. So they were essentially `reinterpret_cast`ed to double. Because I only tested small positive ints but not large ints or negative ints, I missed this bug. ### What changes are included in this PR? 
Add a `FloatingDivide` operator that casts ints to doubles and do floating division. Replace the `division(duration, duration)` with this op. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #39156 Authored-by: Jin Shang Signed-off-by: Sutou Kouhei --- .../kernels/base_arithmetic_internal.h | 38 +++++++++++++++++++ .../compute/kernels/scalar_arithmetic.cc | 7 ++-- .../compute/kernels/scalar_temporal_test.cc | 8 ++-- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h index 7798c615777a4..d59320d270e4f 100644 --- a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h +++ b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h @@ -426,6 +426,44 @@ struct DivideChecked { } }; +struct FloatingDivide { + template + static enable_if_floating_value Call(KernelContext*, Arg0 left, Arg1 right, + Status*) { + return left / right; + } + + template + static enable_if_integer_value Call(KernelContext* ctx, Arg0 left, + Arg1 right, Status* st) { + static_assert(std::is_same::value); + return Call(ctx, static_cast(left), static_cast(right), st); + } + + // TODO: Add decimal +}; + +struct FloatingDivideChecked { + template + static enable_if_floating_value Call(KernelContext*, Arg0 left, Arg1 right, + Status* st) { + static_assert(std::is_same::value && std::is_same::value); + if (ARROW_PREDICT_FALSE(right == 0)) { + *st = Status::Invalid("divide by zero"); + return 0; + } + return left / right; + } + + template + static enable_if_integer_value Call(KernelContext* ctx, Arg0 left, + Arg1 right, Status* st) { + static_assert(std::is_same::value); + return Call(ctx, static_cast(left), static_cast(right), st); + } + // TODO: Add decimal +}; + struct Negate { template static constexpr enable_if_floating_value Call(KernelContext*, Arg arg, Status*) { diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index c305028be19c9..ad33d7f8951f4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -1513,7 +1513,8 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { // Add divide(duration, duration) -> float64 for (auto unit : TimeUnit::values()) { - auto exec = ScalarBinaryNotNull::Exec; + auto exec = + ScalarBinaryNotNull::Exec; DCHECK_OK( divide->AddKernel({duration(unit), duration(unit)}, float64(), std::move(exec))); } @@ -1533,8 +1534,8 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { // Add divide_checked(duration, duration) -> float64 for (auto unit : TimeUnit::values()) { - auto exec = - ScalarBinaryNotNull::Exec; + auto exec = ScalarBinaryNotNull::Exec; DCHECK_OK(divide_checked->AddKernel({duration(unit), duration(unit)}, float64(), std::move(exec))); } diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 4c7975add0308..d8bbe5ca8a34c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -1722,12 +1722,12 @@ TEST_F(ScalarTemporalTest, TestTemporalDivideDuration) { } // div(duration, duration) -> float64 - auto left = ArrayFromJSON(duration(TimeUnit::SECOND), "[1, 2, 3, 4]"); - auto right = ArrayFromJSON(duration(TimeUnit::MILLI), "[4000, 300, 20, 1]"); + auto left = ArrayFromJSON(duration(TimeUnit::SECOND), "[1, 2, -3, 4]"); + auto right 
= ArrayFromJSON(duration(TimeUnit::MILLI), "[4000, -300, 20, 1]"); auto expected_left_by_right = - ArrayFromJSON(float64(), "[0.25, 6.666666666666667, 150, 4000]"); + ArrayFromJSON(float64(), "[0.25, -6.666666666666667, -150, 4000]"); auto expected_right_by_left = - ArrayFromJSON(float64(), "[4, 0.15, 0.006666666666666667, 0.00025]"); + ArrayFromJSON(float64(), "[4, -0.15, -0.006666666666666667, 0.00025]"); CheckScalarBinary("divide", left, right, expected_left_by_right); CheckScalarBinary("divide_checked", left, right, expected_left_by_right); CheckScalarBinary("divide", right, left, expected_right_by_left); From 4841cdaf9f336bdcbe31aff02ebc32e218ab84db Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 10 Dec 2023 07:16:18 -0500 Subject: [PATCH 015/570] GH-38979: [C++] Fix spelling (#38980) ### Rationale for this change ### What changes are included in this PR? Spelling fixes to cpp/src/ ### Are these changes tested? ### Are there any user-facing changes? * Closes: #38979 Authored-by: Josh Soref <2119212+jsoref@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- cpp/src/arrow/adapters/orc/adapter.h | 6 +++--- cpp/src/arrow/c/bridge_test.cc | 4 ++-- cpp/src/arrow/chunked_array_test.cc | 2 +- .../kernels/vector_selection_filter_internal.cc | 2 +- cpp/src/arrow/csv/lexing_internal.h | 2 +- cpp/src/arrow/csv/reader.cc | 2 +- cpp/src/arrow/csv/writer_benchmark.cc | 2 +- .../engine/substrait/extended_expression_internal.cc | 2 +- cpp/src/arrow/engine/substrait/extension_set.h | 2 +- cpp/src/arrow/engine/substrait/options.cc | 2 +- cpp/src/arrow/engine/substrait/serde_test.cc | 8 ++++---- cpp/src/arrow/engine/substrait/visibility.h | 2 +- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 4 ++-- cpp/src/arrow/field_ref_test.cc | 2 +- cpp/src/arrow/integration/json_integration.h | 2 +- cpp/src/arrow/io/file_benchmark.cc | 2 +- cpp/src/arrow/io/interfaces.h | 2 +- cpp/src/arrow/ipc/metadata_internal.cc | 2 +- cpp/src/arrow/ipc/read_write_test.cc | 10 +++++----- cpp/src/arrow/ipc/reader.h | 2 +- cpp/src/arrow/json/converter_test.cc | 2 +- cpp/src/arrow/json/reader.h | 2 +- cpp/src/arrow/table_test.cc | 6 +++--- cpp/src/arrow/testing/util.cc | 2 +- cpp/src/arrow/type_test.cc | 2 +- cpp/src/generated/Schema_generated.h | 2 +- cpp/src/skyhook/CMakeLists.txt | 2 +- cpp/src/skyhook/protocol/rados_protocol.h | 2 +- cpp/src/skyhook/protocol/skyhook_protocol.h | 2 +- 29 files changed, 42 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 013be78600a8f..4ffff81f355f1 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -138,7 +138,7 @@ class ARROW_EXPORT ORCFileReader { /// \brief Get a stripe level record batch iterator. /// /// Each record batch will have up to `batch_size` rows. - /// NextStripeReader serves as a fine grained alternative to ReadStripe + /// NextStripeReader serves as a fine-grained alternative to ReadStripe /// which may cause OOM issues by loading the whole stripe into memory. /// /// Note this will only read rows for the current stripe, not the entire @@ -151,7 +151,7 @@ class ARROW_EXPORT ORCFileReader { /// \brief Get a stripe level record batch iterator. /// /// Each record batch will have up to `batch_size` rows. 
- /// NextStripeReader serves as a fine grained alternative to ReadStripe + /// NextStripeReader serves as a fine-grained alternative to ReadStripe /// which may cause OOM issues by loading the whole stripe into memory. /// /// Note this will only read rows for the current stripe, not the entire @@ -256,7 +256,7 @@ class ARROW_EXPORT ORCFileReader { int64_t GetFileLength(); /// \brief Get the serialized file tail. - /// Usefull if another reader of the same file wants to avoid re-reading + /// Useful if another reader of the same file wants to avoid re-reading /// the file tail. See ReadOptions.SetSerializedFileTail(). /// /// \return a string of bytes with the file tail diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 326c67f5eceac..58bbc9282c204 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -3131,7 +3131,7 @@ TEST_F(TestArrayImport, RunEndEncodedWithOffset) { REEFromJSON(ree_type, "[-2.0, -2.0, -2.0, -2.0, 3.0, 3.0, 3.0]")); CheckImport(expected); - // Ofsset in parent + // Offset in parent FillPrimitive(AddChild(), 5, 0, 0, run_ends_buffers5); FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls5); FillRunEndEncoded(5, 2); @@ -3383,7 +3383,7 @@ TEST_F(TestArrayImport, ListError) { } TEST_F(TestArrayImport, ListViewNoError) { - // Unlike with lists, importing a length-0 list-view with all buffers ommitted is + // Unlike with lists, importing a length-0 list-view with all buffers omitted is // not an error. List-views don't need an extra offset value, so an empty offsets // buffer is valid in this case. diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 46dccaf3c6b86..6ca52ab46ca68 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -228,7 +228,7 @@ TEST_F(TestChunkedArray, Validate) { random::RandomArrayGenerator gen(0); - // Valid if non-empty and ommitted type + // Valid if non-empty and omitted type ArrayVector arrays = {gen.Int64(50, 0, 100, 0.1), gen.Int64(50, 0, 100, 0.1)}; auto chunks_with_no_type = std::make_shared(arrays, nullptr); ASSERT_OK(chunks_with_no_type->ValidateFull()); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 4a5e579fb155e..a25b04ae4fa65 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -171,7 +171,7 @@ class PrimitiveFilterImpl { } if (out_arr->buffers[0] != nullptr) { - // May not be allocated if neither filter nor values contains nulls + // May be unallocated if neither filter nor values contain nulls out_is_valid_ = out_arr->buffers[0]->mutable_data(); } out_data_ = reinterpret_cast(out_arr->buffers[1]->mutable_data()); diff --git a/cpp/src/arrow/csv/lexing_internal.h b/cpp/src/arrow/csv/lexing_internal.h index 357c5716d5115..b1da12750ac58 100644 --- a/cpp/src/arrow/csv/lexing_internal.h +++ b/cpp/src/arrow/csv/lexing_internal.h @@ -71,7 +71,7 @@ class BaseBloomFilter { // For example 'b' (ASCII value 98) will set/test bit #34 in the filter. // If the bit is set in the filter, the given character *may* be part // of the matched characters. If the bit is unset in the filter, - // the the given character *cannot* be part of the matched characters. + // the given character *cannot* be part of the matched characters. 
FilterType CharFilter(uint8_t c) const { return static_cast(1) << (c & kCharMask); } diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 30fc0bc6aca44..332fad054fea3 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -389,7 +389,7 @@ namespace { // The parsed batch contains a list of offsets for each of the columns so that columns // can be individually scanned // -// This operator is not re-entrant +// This operator is not reentrant class BlockParsingOperator { public: BlockParsingOperator(io::IOContext io_context, ParseOptions parse_options, diff --git a/cpp/src/arrow/csv/writer_benchmark.cc b/cpp/src/arrow/csv/writer_benchmark.cc index 9bbba7ebd7e9f..54c0f50613754 100644 --- a/cpp/src/arrow/csv/writer_benchmark.cc +++ b/cpp/src/arrow/csv/writer_benchmark.cc @@ -109,7 +109,7 @@ void BenchmarkWriteCsv(benchmark::State& state, const WriteOptions& options, state.counters["null_percent"] = static_cast(state.range(0)); } -// Exercies UnQuotedColumnPopulator with integer +// Exercises UnQuotedColumnPopulator with integer void WriteCsvNumeric(benchmark::State& state) { auto batch = MakeIntTestBatch(kCsvRows, kCsvCols, state.range(0)); BenchmarkWriteCsv(state, WriteOptions::Defaults(), *batch); diff --git a/cpp/src/arrow/engine/substrait/extended_expression_internal.cc b/cpp/src/arrow/engine/substrait/extended_expression_internal.cc index a6401e1d0b36d..225901c910f25 100644 --- a/cpp/src/arrow/engine/substrait/extended_expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/extended_expression_internal.cc @@ -85,7 +85,7 @@ Result ExpressionFromProto( // expression which is not redundant. // // For example, if the base schema is [struct, i32] and the expression is - // field(0) the the extended expression output names might be ["foo", "my_expression"]. + // field(0) the extended expression output names might be ["foo", "my_expression"]. // The "foo" is redundant but we can verify it matches and reject if it does not. // // The one exception is struct literals which have no field names. For example, if diff --git a/cpp/src/arrow/engine/substrait/extension_set.h b/cpp/src/arrow/engine/substrait/extension_set.h index d9c0af081a546..0a502960447e6 100644 --- a/cpp/src/arrow/engine/substrait/extension_set.h +++ b/cpp/src/arrow/engine/substrait/extension_set.h @@ -86,7 +86,7 @@ struct ARROW_ENGINE_EXPORT IdHashEq { /// \brief Owning storage for ids /// /// Substrait plans may reuse URIs and names in many places. For convenience -/// and performance Substarit ids are typically passed around as views. As we +/// and performance Substrait ids are typically passed around as views. As we /// convert a plan from Substrait to Arrow we need to copy these strings out of /// the Substrait buffer and into owned storage. This class serves as that owned /// storage. 
diff --git a/cpp/src/arrow/engine/substrait/options.cc b/cpp/src/arrow/engine/substrait/options.cc index 481375076734f..f8e7173386583 100644 --- a/cpp/src/arrow/engine/substrait/options.cc +++ b/cpp/src/arrow/engine/substrait/options.cc @@ -81,7 +81,7 @@ class DefaultExtensionProvider : public BaseExtensionProvider { rel.UnpackTo(&seg_agg_rel); return MakeSegmentedAggregateRel(conv_opts, inputs, seg_agg_rel, ext_set); } - return Status::NotImplemented("Unrecognized extension in Susbstrait plan: ", + return Status::NotImplemented("Unrecognized extension in Substrait plan: ", rel.DebugString()); } diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 2e72ae70edd88..1e771ccdd25c2 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -1334,7 +1334,7 @@ TEST(Substrait, GetRecordBatchReader) { ASSERT_OK_AND_ASSIGN(auto reader, ExecuteSerializedPlan(*buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); // Note: assuming the binary.parquet file contains fixed amount of records - // in case of a test failure, re-evalaute the content in the file + // in case of a test failure, re-evaluate the content in the file EXPECT_EQ(table->num_rows(), 12); }); } @@ -4223,7 +4223,7 @@ TEST(Substrait, ReadRelWithGlobFiles) { } }] })")); - // To avoid unnecessar metadata columns being included in the final result + // To avoid unnecessary metadata columns being included in the final result std::vector include_columns = {0, 1, 2}; compute::SortOptions options({compute::SortKey("A", compute::SortOrder::Ascending)}); CheckRoundTripResult(std::move(expected_table), buf, std::move(include_columns), @@ -6108,7 +6108,7 @@ TEST(Substrait, ExtendedExpressionSerialization) { TEST(Substrait, ExtendedExpressionInvalidPlans) { // The schema defines the type as {"x", "y"} but output_names has {"a", "y"} - constexpr std::string_view kBadOuptutNames = R"( + constexpr std::string_view kBadOutputNames = R"( { "referredExpr":[ { @@ -6159,7 +6159,7 @@ TEST(Substrait, ExtendedExpressionInvalidPlans) { )"; ASSERT_OK_AND_ASSIGN( - auto buf, internal::SubstraitFromJSON("ExtendedExpression", kBadOuptutNames)); + auto buf, internal::SubstraitFromJSON("ExtendedExpression", kBadOutputNames)); ASSERT_THAT(DeserializeExpressions(*buf), Raises(StatusCode::Invalid, testing::HasSubstr("Ambiguous plan"))); diff --git a/cpp/src/arrow/engine/substrait/visibility.h b/cpp/src/arrow/engine/substrait/visibility.h index cfd0db2747bba..d81d202ee6567 100644 --- a/cpp/src/arrow/engine/substrait/visibility.h +++ b/cpp/src/arrow/engine/substrait/visibility.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -// TODO(westonpace): Once we have a propert engine module this file +// TODO(westonpace): Once we have a proper engine module this file // should be renamed arrow/engine/visibility.h // This API is EXPERIMENTAL. 
diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index b8be1edc49e60..2b8e703d3c66e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -194,7 +194,7 @@ TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { "Invalid dim_names"); } -TEST_F(TestExtensionType, RoudtripBatch) { +TEST_F(TestExtensionType, RoundtripBatch) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; @@ -383,7 +383,7 @@ TEST_F(TestExtensionType, SliceTensor) { ASSERT_EQ(sliced->length(), partial->length()); } -TEST_F(TestExtensionType, RoudtripBatchFromTensor) { +TEST_F(TestExtensionType, RoundtripBatchFromTensor) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_, {}, {"n", "x", "y"})); diff --git a/cpp/src/arrow/field_ref_test.cc b/cpp/src/arrow/field_ref_test.cc index 10e2564ed1896..0cb2da4f709a1 100644 --- a/cpp/src/arrow/field_ref_test.cc +++ b/cpp/src/arrow/field_ref_test.cc @@ -135,7 +135,7 @@ struct FieldPathTestCase { out.schema = arrow::schema({out.v0.field, out.v1.field}); out.type = struct_(out.schema->fields()); - // Create null bitmaps for the struct fields independent of its childrens' + // Create null bitmaps for the struct fields independent of its children's // bitmaps. For FieldPath::GetFlattened, parent/child bitmaps should be combined // - for FieldPath::Get, higher-level nulls are ignored. auto bitmap1_1 = gen.NullBitmap(kNumRows, 0.15); diff --git a/cpp/src/arrow/integration/json_integration.h b/cpp/src/arrow/integration/json_integration.h index 0284ef6c89d97..13abfae095ab6 100644 --- a/cpp/src/arrow/integration/json_integration.h +++ b/cpp/src/arrow/integration/json_integration.h @@ -40,7 +40,7 @@ class ARROW_EXPORT IntegrationJsonWriter { /// \brief Create a new JSON writer that writes to memory /// /// \param[in] schema the schema of record batches - /// \return the creater writer object + /// \return the creator writer object static Result> Open( const std::shared_ptr& schema); diff --git a/cpp/src/arrow/io/file_benchmark.cc b/cpp/src/arrow/io/file_benchmark.cc index 7fd10a0a0e659..02ccfb6337f4b 100644 --- a/cpp/src/arrow/io/file_benchmark.cc +++ b/cpp/src/arrow/io/file_benchmark.cc @@ -220,7 +220,7 @@ static void BenchmarkStreamingWrites(benchmark::State& state, // Benchmark writing to /dev/null // -// This situation is irrealistic as the kernel likely doesn't +// This situation is unrealistic as the kernel likely doesn't // copy the data at all, so we only measure small writes. static void FileOutputStreamSmallWritesToNull( diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index d2a11b7b6d7ce..b36c38c6d4868 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -196,7 +196,7 @@ class ARROW_EXPORT Readable { /// EXPERIMENTAL: The IOContext associated with this file. /// /// By default, this is the same as default_io_context(), but it may be - /// overriden by subclasses. + /// overridden by subclasses. 
virtual const IOContext& io_context() const; }; diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 4f41edf8e15db..4154b594d9507 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1423,7 +1423,7 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr metadata; RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata)); - // set endianess using the value in flatbuf schema + // set endianness using the value in flatbuf schema auto endianness = schema->endianness() == flatbuf::Endianness::Little ? Endianness::Little : Endianness::Big; diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 5c15cb912e4a7..17c4c5636d5b0 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -140,7 +140,7 @@ TEST_P(TestMessage, SerializeTo) { output_length); ASSERT_OK_AND_EQ(output_length, stream->Tell()); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Finish()); - // chech whether length is written in little endian + // check whether length is written in little endian auto buffer_ptr = buffer.get()->data(); ASSERT_EQ(output_length - body_length - prefix_size, bit_util::FromLittleEndian(*(uint32_t*)(buffer_ptr + 4))); @@ -363,7 +363,7 @@ TEST_F(TestSchemaMetadata, MetadataVersionForwardCompatibility) { std::string root; ASSERT_OK(GetTestResourceRoot(&root)); - // schema_v6.arrow with currently non-existent MetadataVersion::V6 + // schema_v6.arrow with currently nonexistent MetadataVersion::V6 std::stringstream schema_v6_path; schema_v6_path << root << "/forward-compatibility/schema_v6.arrow"; @@ -520,7 +520,7 @@ class IpcTestFixture : public io::MemoryMapFixture, public ExtensionTypesMixin { }; TEST(MetadataVersion, ForwardsCompatCheck) { - // Verify UBSAN is ok with casting out of range metdata version. + // Verify UBSAN is ok with casting out of range metadata version. EXPECT_LT(flatbuf::MetadataVersion::MAX, static_cast(72)); } @@ -3019,14 +3019,14 @@ TEST(TestRecordBatchFileReaderIo, SkipTheFieldInTheMiddle) { GetReadRecordBatchReadRanges({0, 2}, {1, 40}); } -TEST(TestRecordBatchFileReaderIo, ReadTwoContinousFields) { +TEST(TestRecordBatchFileReaderIo, ReadTwoContinuousFields) { // read the int32 field and the int64 field // + 5 int32: 5 * 4 bytes // + 5 int64: 5 * 8 bytes GetReadRecordBatchReadRanges({1, 2}, {20, 40}); } -TEST(TestRecordBatchFileReaderIo, ReadTwoContinousFieldsWithIoMerged) { +TEST(TestRecordBatchFileReaderIo, ReadTwoContinuousFieldsWithIoMerged) { // change the array length to 64 so that bool field and int32 are continuous without // padding // read the bool field and the int32 field since the bool field's aligned offset diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index de4606094049c..888f59a627771 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -258,7 +258,7 @@ class ARROW_EXPORT Listener { virtual Status OnEOS(); /// \brief Called when a record batch is decoded and - /// OnRecordBatchWithMetadataDecoded() isn't overrided. + /// OnRecordBatchWithMetadataDecoded() isn't overridden. /// /// The default implementation just returns /// arrow::Status::NotImplemented(). 
diff --git a/cpp/src/arrow/json/converter_test.cc b/cpp/src/arrow/json/converter_test.cc index cfc44c99976d5..fa85e704bc5e3 100644 --- a/cpp/src/arrow/json/converter_test.cc +++ b/cpp/src/arrow/json/converter_test.cc @@ -39,7 +39,7 @@ Result> Convert(std::shared_ptr type, return converted; } -// bool, null are trivial pass throughs +// bool, null are trivial pass-throughs TEST(ConverterTest, Integers) { for (auto int_type : {int8(), int16(), int32(), int64()}) { diff --git a/cpp/src/arrow/json/reader.h b/cpp/src/arrow/json/reader.h index 7776cb0b7d8a0..b7849a83ba1f8 100644 --- a/cpp/src/arrow/json/reader.h +++ b/cpp/src/arrow/json/reader.h @@ -79,7 +79,7 @@ class ARROW_EXPORT StreamingReader : public RecordBatchReader { /// threading is disabled, this will block until completion. virtual Future> ReadNextAsync() = 0; - /// Get the number of bytes which have been succesfully converted to record batches + /// Get the number of bytes which have been successfully converted to record batches /// and consumed [[nodiscard]] virtual int64_t bytes_processed() const = 0; diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 3949caa402846..5f6905ce672d2 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -179,7 +179,7 @@ TEST_F(TestTable, Equals) { other = Table::Make(schema_, other_columns); ASSERT_FALSE(table_->Equals(*other)); - // Differring schema metadata + // Differing schema metadata other_schema = schema_->WithMetadata(::arrow::key_value_metadata({"key"}, {"value"})); other = Table::Make(other_schema, columns_); ASSERT_TRUE(table_->Equals(*other)); @@ -635,8 +635,8 @@ TEST_F(TestTable, SelectColumns) { ASSERT_OK_AND_ASSIGN(auto subset, table->SelectColumns({0, 2})); ASSERT_OK(subset->ValidateFull()); - auto expexted_schema = ::arrow::schema({schema_->field(0), schema_->field(2)}); - auto expected = Table::Make(expexted_schema, {table->column(0), table->column(2)}); + auto expected_schema = ::arrow::schema({schema_->field(0), schema_->field(2)}); + auto expected = Table::Make(expected_schema, {table->column(0), table->column(2)}); ASSERT_TRUE(subset->Equals(*expected)); // Out of bounds indices diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index e8a782575e278..36351fa8595be 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -198,7 +198,7 @@ std::string GetListenAddress() { ss << "." << byte; } #else - // On MacOS, only 127.0.0.1 is a valid loopback address by default. + // On macOS, only 127.0.0.1 is a valid loopback address by default. 
ss << "127.0.0.1"; #endif // Append port number diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 009e557f82f68..22913f77fbfc1 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -612,7 +612,7 @@ TEST_F(TestSchema, TestMetadataConstruction) { AssertSchemaEqual(schema2, schema1); AssertSchemaNotEqual(schema2, schema1, /*check_metadata=*/true); - // Field has different metatadata + // Field has different metadata AssertSchemaEqual(schema2, schema3); AssertSchemaNotEqual(schema2, schema3, /*check_metadata=*/true); diff --git a/cpp/src/generated/Schema_generated.h b/cpp/src/generated/Schema_generated.h index eeeeac68f0a45..12ee81e6743b5 100644 --- a/cpp/src/generated/Schema_generated.h +++ b/cpp/src/generated/Schema_generated.h @@ -1725,7 +1725,7 @@ inline ::flatbuffers::Offset + **/module-info.java dev/checkstyle/checkstyle.xml dev/checkstyle/checkstyle.license dev/checkstyle/suppressions.xml @@ -371,6 +372,24 @@ + + org.apache.arrow.maven.plugins + module-info-compiler-maven-plugin + + + default-compile + + compile + + + + default-testCompile + + testCompile + + + + @@ -400,6 +419,8 @@ maven-compiler-plugin ${maven-compiler-plugin.version} + **/module-info.java + **/module-info.java false @@ -546,6 +567,11 @@ + + org.apache.arrow.maven.plugins + module-info-compiler-maven-plugin + ${project.version} + @@ -735,6 +761,7 @@ + maven bom format memory @@ -1236,7 +1263,6 @@ - From 4aa9f604dfdab4c4b524a5b18c7976adb10c9b41 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 12 Dec 2023 17:07:56 -0400 Subject: [PATCH 038/570] GH-39185: [C++] Remove compiler warnings with `-Wconversion -Wno-sign-conversion` in public headers (#39186) ### Rationale for this change The R package has a warning from CRAN to fix a failure to compile with `-Wconversion -Wno-sign-conversion -Werror`. Some of these errors we control and can patch easily; however, the ones in the Arrow C++ portion are more difficult to work around (hence the separate PR). See #39138 for all reported errors (including those in just the R package). ### What changes are included in this PR? The requisite `static_cast<>()`s were added to silence the warnings. ### Are these changes tested? By existing tests. We may add a future R nightly job that runs with these warning flags. ### Are there any user-facing changes? 
No * Closes: #39185 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- cpp/src/arrow/util/bit_util.h | 9 ++++++--- cpp/src/arrow/util/bitmap_generate.h | 7 ++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 04ab07af1d779..1d3a1dc2459f9 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -335,7 +335,9 @@ void ClearBitmap(uint8_t* data, int64_t offset, int64_t length); /// ref: https://stackoverflow.com/a/59523400 template constexpr Word PrecedingWordBitmask(unsigned int const i) { - return (static_cast(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1; + return static_cast(static_cast(i < sizeof(Word) * 8) + << (i & (sizeof(Word) * 8 - 1))) - + 1; } static_assert(PrecedingWordBitmask(0) == 0x00, ""); static_assert(PrecedingWordBitmask(4) == 0x0f, ""); @@ -357,8 +359,9 @@ constexpr Word SpliceWord(int n, Word low, Word high) { template void PackBits(const uint32_t* values, uint8_t* out) { for (int i = 0; i < batch_size / 8; ++i) { - *out++ = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | - values[4] << 4 | values[5] << 5 | values[6] << 6 | values[7] << 7); + *out++ = static_cast(values[0] | values[1] << 1 | values[2] << 2 | + values[3] << 3 | values[4] << 4 | values[5] << 5 | + values[6] << 6 | values[7] << 7); values += 8; } } diff --git a/cpp/src/arrow/util/bitmap_generate.h b/cpp/src/arrow/util/bitmap_generate.h index 5efc5d5a1d501..52a1e228e01f1 100644 --- a/cpp/src/arrow/util/bitmap_generate.h +++ b/cpp/src/arrow/util/bitmap_generate.h @@ -90,9 +90,10 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length, for (int i = 0; i < 8; ++i) { out_results[i] = g(); } - *cur++ = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 | - out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 | - out_results[6] << 6 | out_results[7] << 7); + *cur++ = static_cast(out_results[0] | out_results[1] << 1 | + out_results[2] << 2 | out_results[3] << 3 | + out_results[4] << 4 | out_results[5] << 5 | + out_results[6] << 6 | out_results[7] << 7); } int64_t remaining_bits = remaining % 8; From d2209582a0ef81c93342183cab3c12d69e79c5be Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 13 Dec 2023 10:15:22 -0500 Subject: [PATCH 039/570] MINOR: [JS] Fix typo in unmemoize comment (#39084) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Authored-by: Dominik Moritz Signed-off-by: Raúl Cumplido --- js/src/vector.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/vector.ts b/js/src/vector.ts index 8c9a3da66c92c..7e1caa343562c 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -324,7 +324,7 @@ export class Vector { * Returns a vector without memoization of the {@link get} method. If this * vector is not memoized, this method returns this vector. * - * @returns A a vector without memoization. + * @returns A new vector without memoization. */ public unmemoize(): Vector { if (DataType.isDictionary(this.type) && this.isMemoized) { From b5a46572cb6446d1c08ca1e111733b4861e8ddca Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 14 Dec 2023 01:28:02 +0800 Subject: [PATCH 040/570] GH-39208: [C++][Parquet] Remove deprecated AppendRowGroup(int64_t num_rows) (#39209) ### Rationale for this change Described in issue ### What changes are included in this PR? 
Remove the function below: ``` /// \note Deprecated since 1.3.0 RowGroupWriter* AppendRowGroup(int64_t num_rows); ``` ### Are these changes tested? no ### Are there any user-facing changes? no * Closes: #39208 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/file_writer.cc | 4 ---- cpp/src/parquet/file_writer.h | 12 ------------ 2 files changed, 16 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 5502e1f94a9d0..6f5610b934d81 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -642,10 +642,6 @@ RowGroupWriter* ParquetFileWriter::AppendBufferedRowGroup() { return contents_->AppendBufferedRowGroup(); } -RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) { - return AppendRowGroup(); -} - void ParquetFileWriter::AddKeyValueMetadata( const std::shared_ptr& key_value_metadata) { if (contents_) { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 3bda1e535cfa6..31706af86dbde 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -147,9 +147,6 @@ class PARQUET_EXPORT ParquetFileWriter { // Perform any cleanup associated with the file contents virtual void Close() = 0; - /// \note Deprecated since 1.3.0 - RowGroupWriter* AppendRowGroup(int64_t num_rows); - virtual RowGroupWriter* AppendRowGroup() = 0; virtual RowGroupWriter* AppendBufferedRowGroup() = 0; @@ -190,15 +187,6 @@ class PARQUET_EXPORT ParquetFileWriter { void Open(std::unique_ptr contents); void Close(); - // Construct a RowGroupWriter for the indicated number of rows. - // - // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid - // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. - // @param num_rows The number of rows that are stored in the new RowGroup - // - // \deprecated Since 1.3.0 - RowGroupWriter* AppendRowGroup(int64_t num_rows); - /// Construct a RowGroupWriter with an arbitrary number of rows. /// /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid From 4142607f61a2e52fddaaee6e82a9e1be1d462cd9 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 14 Dec 2023 01:28:21 +0800 Subject: [PATCH 041/570] GH-39210: [C++][Parquet] Avoid WriteRecordBatch from produce zero-sized RowGroup (#39211) ### Rationale for this change `WriteRecordBatch` might produce zero-sized row-group, which is mentioned in https://github.com/apache/arrow/issues/39210 . This patch avoid WriteRecordBatch from produce zero-sized RowGroup. ### What changes are included in this PR? adding a check for zero-sized row-group ### Are these changes tested? Yes ### Are there any user-facing changes? 
no * Closes: #39210 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/arrow_reader_writer_test.cc | 41 +++++++++++++++++++ cpp/src/parquet/arrow/writer.cc | 6 ++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index a2f3498190f93..dd0b19c2ce048 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5224,6 +5224,47 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) { EXPECT_TRUE(record_batch->Equals(*read_record_batch)); } +TEST(TestArrowReadWrite, WriteRecordBatchNotProduceEmptyRowGroup) { + // GH-39211: WriteRecordBatch should prevent from writing a empty row group + // in the end of the file. + auto pool = ::arrow::default_memory_pool(); + auto sink = CreateOutputStream(); + // Limit the max number of rows in a row group to 2 + auto writer_properties = WriterProperties::Builder().max_row_group_length(2)->build(); + auto arrow_writer_properties = default_arrow_writer_properties(); + + // Prepare schema + auto schema = ::arrow::schema({::arrow::field("a", ::arrow::int64())}); + std::shared_ptr parquet_schema; + ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties, + *arrow_writer_properties, &parquet_schema)); + auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); + + auto gen = ::arrow::random::RandomArrayGenerator(/*seed=*/42); + + // Create writer to write data via RecordBatch. + auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties); + std::unique_ptr arrow_writer; + ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties, + &arrow_writer)); + // NewBufferedRowGroup() is not called explicitly and it will be called + // inside WriteRecordBatch(). + // Write 20 rows for two times + for (int i = 0; i < 2; ++i) { + auto record_batch = + gen.BatchOf({::arrow::field("a", ::arrow::int64())}, /*length=*/20); + ASSERT_OK_NO_THROW(arrow_writer->WriteRecordBatch(*record_batch)); + } + ASSERT_OK_NO_THROW(arrow_writer->Close()); + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + auto file_metadata = arrow_writer->metadata(); + EXPECT_EQ(20, file_metadata->num_row_groups()); + for (int i = 0; i < 20; ++i) { + EXPECT_EQ(2, file_metadata->RowGroup(i)->num_rows()); + } +} + TEST(TestArrowReadWrite, MultithreadedWrite) { const int num_columns = 20; const int num_rows = 1000; diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 07c627d5eda67..5238986c428d3 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -419,6 +419,7 @@ class FileWriterImpl : public FileWriter { // Max number of rows allowed in a row group. const int64_t max_row_group_length = this->properties().max_row_group_length(); + // Initialize a new buffered row group writer if necessary. if (row_group_writer_ == nullptr || !row_group_writer_->buffered() || row_group_writer_->num_rows() >= max_row_group_length) { RETURN_NOT_OK(NewBufferedRowGroup()); @@ -461,8 +462,9 @@ class FileWriterImpl : public FileWriter { RETURN_NOT_OK(WriteBatch(offset, batch_size)); offset += batch_size; - // Flush current row group if it is full. - if (row_group_writer_->num_rows() >= max_row_group_length) { + // Flush current row group writer and create a new writer if it is full. 
+ if (row_group_writer_->num_rows() >= max_row_group_length && + offset < batch.num_rows()) { RETURN_NOT_OK(NewBufferedRowGroup()); } } From dbed728f840bdb84880708dda865ba4c985e95f9 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:31:51 -0500 Subject: [PATCH 042/570] GH-39189: [Java] Bump com.h2database:h2 from 1.4.196 to 2.2.224 in /java (#39188) ### Rationale for this change Dependabot flagged this upgrade, but it requires test code changes. H2 is an in-memory database used for JDBC testing and 2.0 had several backwards-breaking changes: https://h2database.com/html/migration-to-v2.html ### What changes are included in this PR? * h2database upgraded from 1.4.196 -> 2.2.224 * H2 changed VARCHAR description from `VARCHAR` to `CHARACTER VARYING` * To query all tables/columns in H2, use `null` values for catalog and schema parameters instead of `%` * H2 now returns Binary and Blob data as a byte array instead of hex values * H2 added the type `VARBINARY`. `Binary` must now be a fixed length and is padded with zeroes. * H2 `CHAR` is fixed length and pads with whitespace now * H2 enforces all `ARRAY`s must be typed * H2 changed the literal syntax for arrays to be `ARRAY[val1, val2, ...]` from `(val1, val2, ...)` * H2 handles unicode chars natively now * H2 connections' `createArrayOf` API handles null values differently now ### Are these changes tested? Unit tests. ### Are there any user-facing changes? No, only tests updated. * Closes: #39189 Authored-by: Dane Pitkin Signed-off-by: David Li --- java/adapter/jdbc/pom.xml | 2 +- .../jdbc/JdbcToArrowCommentMetadataTest.java | 6 ++--- .../adapter/jdbc/JdbcToArrowTestHelper.java | 12 +--------- .../org/apache/arrow/adapter/jdbc/Table.java | 23 ++---------------- .../adapter/jdbc/h2/JdbcToArrowArrayTest.java | 21 ++++++++++------ .../resources/h2/test1_all_datatypes_h2.yml | 24 +++++++++---------- .../h2/test1_all_datatypes_null_h2.yml | 4 ++-- ...t1_all_datatypes_selected_null_rows_h2.yml | 8 +++---- .../src/test/resources/h2/test1_binary_h2.yml | 2 +- .../src/test/resources/h2/test1_char_h2.yml | 2 +- .../test/resources/h2/test1_charset_ch_h2.yml | 2 +- .../test/resources/h2/test1_charset_h2.yml | 2 +- .../test/resources/h2/test1_charset_jp_h2.yml | 2 +- .../test/resources/h2/test1_charset_kr_h2.yml | 2 +- .../src/test/resources/h2/test1_list_h2.yml | 22 ++++++++--------- .../h2/test1_selected_datatypes_null_h2.yml | 4 ++-- java/performance/pom.xml | 2 +- 17 files changed, 59 insertions(+), 81 deletions(-) diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index 2490f708e6f24..f95956d1f61d5 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -51,7 +51,7 @@ com.h2database h2 - 1.4.196 + 2.2.224 test diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowCommentMetadataTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowCommentMetadataTest.java index dc52210d6c7ab..07cab0d829fed 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowCommentMetadataTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowCommentMetadataTest.java @@ -127,7 +127,7 @@ public void schemaCommentWithDatabaseMetadata() throws Exception { "SQL_SCHEMA_NAME", "PUBLIC", "SQL_TABLE_NAME", "TABLE1", "SQL_COLUMN_NAME", "NAME", - "SQL_TYPE", "VARCHAR", + "SQL_TYPE", "CHARACTER VARYING", "comment", "Name of record")), field("COLUMN1", true, 
Types.MinorType.BIT.getType(), metadata( @@ -205,7 +205,7 @@ private String getTableComment(DatabaseMetaData metaData, String tableName) thro } String comment = null; int rowCount = 0; - try (ResultSet tableMetadata = metaData.getTables("%", "%", tableName, null)) { + try (ResultSet tableMetadata = metaData.getTables(null, null, tableName, null)) { if (tableMetadata.next()) { comment = tableMetadata.getString("REMARKS"); rowCount++; @@ -221,7 +221,7 @@ private String getTableComment(DatabaseMetaData metaData, String tableName) thro } private String getColumnComment(DatabaseMetaData metaData, String tableName, String columnName) throws SQLException { - try (ResultSet tableMetadata = metaData.getColumns("%", "%", tableName, columnName)) { + try (ResultSet tableMetadata = metaData.getColumns(null, null, tableName, columnName)) { if (tableMetadata.next()) { return tableMetadata.getString("REMARKS"); } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java index d5f896ba7df56..91f2f465dd989 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowTestHelper.java @@ -334,16 +334,6 @@ public static void assertFieldMetadataMatchesResultSetMetadata(ResultSetMetaData } } - public static byte[] hexStringToByteArray(String s) { - int len = s.length(); - byte[] data = new byte[len / 2]; - for (int i = 0; i < len; i += 2) { - data[i / 2] = (byte) ((Character.digit(s.charAt(i), 16) << 4) + - Character.digit(s.charAt(i + 1), 16)); - } - return data; - } - public static Integer[] getIntValues(String[] values, String dataType) { String[] dataArr = getValues(values, dataType); Integer[] valueArr = new Integer[dataArr.length]; @@ -429,7 +419,7 @@ public static byte[][] getBinaryValues(String[] values, String dataType) { byte[][] valueArr = new byte[dataArr.length][]; int i = 0; for (String data : dataArr) { - valueArr[i++] = "null".equals(data.trim()) ? null : hexStringToByteArray(data.trim()); + valueArr[i++] = "null".equals(data.trim()) ? 
null : data.trim().getBytes(); } return valueArr; } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java index 87d5765b5d4b4..50c4fe6db2a14 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java @@ -129,7 +129,7 @@ public Float[] getFloatValues() { } public byte[][] getBinaryValues() { - return getHexToByteArray(values); + return getByteArray(values); } public byte[][] getVarCharValues() { @@ -137,7 +137,7 @@ public byte[][] getVarCharValues() { } public byte[][] getBlobValues() { - return getBinaryValues(); + return getByteArray(values); } public byte[][] getClobValues() { @@ -221,23 +221,4 @@ static byte[][] getByteArray(String[] data) { } return byteArr; } - - static byte[][] getHexToByteArray(String[] data) { - byte[][] byteArr = new byte[data.length][]; - - for (int i = 0; i < data.length; i++) { - byteArr[i] = hexStringToByteArray(data[i]); - } - return byteArr; - } - - static byte[] hexStringToByteArray(String s) { - int len = s.length(); - byte[] data = new byte[len / 2]; - for (int i = 0; i < len; i += 2) { - data[i / 2] = (byte) ((Character.digit(s.charAt(i), 16) << 4) + - Character.digit(s.charAt(i + 1), 16)); - } - return data; - } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java index b7dc1ee58a5ba..377e332b43a13 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java @@ -53,7 +53,8 @@ public class JdbcToArrowArrayTest { private Connection conn = null; private static final String CREATE_STATEMENT = - "CREATE TABLE array_table (id INTEGER, int_array ARRAY, float_array ARRAY, string_array ARRAY);"; + "CREATE TABLE array_table (id INTEGER, int_array INTEGER ARRAY, float_array REAL ARRAY, " + + "string_array VARCHAR ARRAY);"; private static final String INSERT_STATEMENT = "INSERT INTO array_table (id, int_array, float_array, string_array) VALUES (?, ?, ?, ?);"; private static final String QUERY = "SELECT int_array, float_array, string_array FROM array_table ORDER BY id;"; @@ -354,9 +355,9 @@ private void insertRows( Float[] floatArray = floatArrays[i]; String[] strArray = strArrays[i]; - Array intArray = conn.createArrayOf("INT", integerArray); - Array realArray = conn.createArrayOf("REAL", floatArray); - Array varcharArray = conn.createArrayOf("VARCHAR", strArray); + Array intArray = integerArray != null ? conn.createArrayOf("INT", integerArray) : null; + Array realArray = floatArray != null ? conn.createArrayOf("REAL", floatArray) : null; + Array varcharArray = strArray != null ? 
conn.createArrayOf("VARCHAR", strArray) : null; // Insert Arrays of 4 Values in Each Row stmt.setInt(1, i); @@ -366,9 +367,15 @@ private void insertRows( stmt.executeUpdate(); - intArray.free(); - realArray.free(); - varcharArray.free(); + if (intArray != null) { + intArray.free(); + } + if (realArray != null) { + realArray.free(); + } + if (varcharArray != null) { + varcharArray.free(); + } } } } diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml index ff76acf8d7cfb..c4f0017095df0 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_h2.yml @@ -13,59 +13,59 @@ name: 'test1_all_datatypes_h2' create: 'CREATE TABLE table1 (int_field1 INT, bool_field2 BOOLEAN, tinyint_field3 TINYINT, smallint_field4 SMALLINT, bigint_field5 BIGINT, decimal_field6 DECIMAL(20,2), double_field7 DOUBLE, real_field8 REAL, time_field9 TIME, date_field10 DATE, timestamp_field11 TIMESTAMP, - binary_field12 BINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(16), bit_field17 BIT, - null_field18 NULL, list_field19 ARRAY, map_field20 VARCHAR(256));' + binary_field12 VARBINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(14), bit_field17 BIT, + null_field18 NULL, list_field19 INT ARRAY, map_field20 VARCHAR(256));' data: - 'INSERT INTO table1 VALUES (101, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (1, 2, 3), ''{"a":"b","key":"12345"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[1, 2, 3], ''{"a":"b","key":"12345"}'');' - 'INSERT INTO table1 VALUES (102, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (1, 2),''{"c":"d"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[1, 2],''{"c":"d"}'');' - 'INSERT INTO table1 VALUES (103, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), 
''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (1),''{"e":"f"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[1],''{"e":"f"}'');' - 'INSERT INTO table1 VALUES (104, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (2, 3, 4),''{"g":"h"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[2, 3, 4],''{"g":"h"}'');' - 'INSERT INTO table1 VALUES (null, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (2, 3),''{"i":"j"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[2, 3],''{"i":"j"}'');' - 'INSERT INTO table1 VALUES (null, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (2),''{"k":"l"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[2],''{"k":"l"}'');' - 'INSERT INTO table1 VALUES (107, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - 
''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (3, 4, 5),''{"m":"n"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[3, 4, 5],''{"m":"n"}'');' - 'INSERT INTO table1 VALUES (108, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (3, 4),''{"o":"p"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[3, 4],''{"o":"p"}'');' - 'INSERT INTO table1 VALUES (109, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (3),''{"q":"r"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[3],''{"q":"r"}'');' - 'INSERT INTO table1 VALUES (110, 1, 45, 12000, 92233720, 17345667789.23, 56478356785.345, 56478356785.345, PARSEDATETIME(''12:45:35 GMT'', ''HH:mm:ss z''), PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', - ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, (),''{"s":"t"}'');' + ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', 1, null, ARRAY[],''{"s":"t"}'');' query: 'select int_field1, bool_field2, tinyint_field3, smallint_field4, bigint_field5, decimal_field6, double_field7, real_field8, time_field9, date_field10, timestamp_field11, binary_field12, varchar_field13, blob_field14, clob_field15, char_field16, bit_field17, null_field18, list_field19, map_field20 from table1' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_null_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_null_h2.yml index e1b1a1adcbb70..9be76229dab82 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_null_h2.yml +++ 
b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_null_h2.yml @@ -38,8 +38,8 @@ rowCount: '5' create: 'CREATE TABLE table1 (int_field1 INT, bool_field2 BOOLEAN, tinyint_field3 TINYINT, smallint_field4 SMALLINT, bigint_field5 BIGINT, decimal_field6 DECIMAL(20,2), double_field7 DOUBLE, real_field8 REAL, time_field9 TIME, date_field10 DATE, timestamp_field11 TIMESTAMP, - binary_field12 BINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(16), bit_field17 BIT, - list_field19 ARRAY,map_field20 VARCHAR(256));' + binary_field12 VARBINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(14), bit_field17 BIT, + list_field19 INT ARRAY, map_field20 VARCHAR(256));' data: - 'INSERT INTO table1 VALUES (null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null);' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml index 0521ce2f9c30d..fda31da150775 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_all_datatypes_selected_null_rows_h2.yml @@ -36,8 +36,8 @@ vectors: create: 'CREATE TABLE table1 (int_field1 INT, bool_field2 BOOLEAN, tinyint_field3 TINYINT, smallint_field4 SMALLINT, bigint_field5 BIGINT, decimal_field6 DECIMAL(20,2), double_field7 DOUBLE, real_field8 REAL, time_field9 TIME, date_field10 DATE, timestamp_field11 TIMESTAMP, - binary_field12 BINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(16), bit_field17 BIT, - list_field19 ARRAY, map_field20 VARCHAR(256));' + binary_field12 VARBINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(14), bit_field17 BIT, + list_field19 INT ARRAY, map_field20 VARCHAR(256));' data: - 'INSERT INTO table1 VALUES (null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null);' @@ -46,7 +46,7 @@ data: PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', - 1, (1, 2, 3),''{"a":"b"}'');' + 1, ARRAY[1, 2, 3],''{"a":"b"}'');' - 'INSERT INTO table1 VALUES (null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null);' @@ -54,7 +54,7 @@ data: PARSEDATETIME(''2018-02-12 GMT'', ''yyyy-MM-dd z''), PARSEDATETIME(''2018-02-12 12:45:35 GMT'', ''yyyy-MM-dd HH:mm:ss z''), ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to varchar'', ''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'', ''some text that needs to be converted to clob'', ''some char text'', - 1, (1, 2, 3),''{"c":"d"}'');' + 1, ARRAY[1, 2, 3],''{"c":"d"}'');' - 'INSERT INTO table1 VALUES (null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null);' diff --git 
a/java/adapter/jdbc/src/test/resources/h2/test1_binary_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_binary_h2.yml index ed94a7a189135..3d7b1ec658ef7 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_binary_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_binary_h2.yml @@ -15,7 +15,7 @@ type: 'binary' vector: 'BINARY_FIELD12' -create: 'CREATE TABLE table1 (binary_field12 BINARY(100));' +create: 'CREATE TABLE table1 (binary_field12 VARBINARY(100));' data: - 'INSERT INTO table1 VALUES (''736f6d6520746578742074686174206e6565647320746f20626520636f6e76657274656420746f2062696e617279'');' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_char_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_char_h2.yml index 018fe46c3ed53..588df7bff4df6 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_char_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_char_h2.yml @@ -15,7 +15,7 @@ type: 'char' vector: 'CHAR_FIELD16' -create: 'CREATE TABLE table1 (char_field16 CHAR(16));' +create: 'CREATE TABLE table1 (char_field16 CHAR(14));' data: - 'INSERT INTO table1 VALUES (''some char text'');' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_charset_ch_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_charset_ch_h2.yml index 1a82fa60a0b97..2e60a4af5a970 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_charset_ch_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_charset_ch_h2.yml @@ -22,7 +22,7 @@ rowCount: '5' charSet: 'GBK' -create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(128));' +create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(13));' data: - 'INSERT INTO table1 VALUES (101,''一些帶有char編碼的文本需要轉換為varchar'', ''一些带有char编码的文本需要转换为clob'', ''一些char编码的字符文本'');' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_charset_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_charset_h2.yml index 42f088e18d931..383681e5b3b41 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_charset_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_charset_h2.yml @@ -20,7 +20,7 @@ vectors: rowCount: '10' -create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(128));' +create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(33));' data: - 'INSERT INTO table1 VALUES (101,''some text with char encoding that needs to be converted to varchar'', ''some text with char encoding that needs to be converted to clob'', ''some char text with char encoding'');' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_charset_jp_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_charset_jp_h2.yml index 2eae4019277ea..9b3cf9a18fe01 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_charset_jp_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_charset_jp_h2.yml @@ -22,7 +22,7 @@ rowCount: '5' charSet: 'SJIS' -create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(128));' +create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(23));' data: - 'INSERT INTO table1 VALUES (101,''varcharに変換する必要があるcharエンコーディングのテキスト'', ''charエンコーディングのあるテキストをclobに変換する必要がある'', ''charエンコーディングのあるcharテキスト'');' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_charset_kr_h2.yml 
b/java/adapter/jdbc/src/test/resources/h2/test1_charset_kr_h2.yml index c6b6ee0551a36..d6e051c094fbe 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_charset_kr_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_charset_kr_h2.yml @@ -22,7 +22,7 @@ rowCount: '5' charSet: 'EUC-KR' -create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(128));' +create: 'CREATE TABLE table1 (int_field1 INT, varchar_field13 VARCHAR(256), clob_field15 CLOB, char_field16 CHAR(22));' data: - 'INSERT INTO table1 VALUES (101,''char 인코딩을 사용하는 일부 텍스트를 varchar로 변환해야합니다.'', ''clob로 변환해야하는 char 인코딩을 가진 텍스트'', ''char 인코딩을 사용한 char 텍스트'');' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_list_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_list_h2.yml index 1314c49bf70fa..044c22182af58 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_list_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_list_h2.yml @@ -15,19 +15,19 @@ type: 'list' vector: 'LIST_FIELD19' -create: 'CREATE TABLE table1 (list_field19 ARRAY);' +create: 'CREATE TABLE table1 (list_field19 INT ARRAY);' data: - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' - - 'INSERT INTO table1 VALUES ((1,2,3));' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' + - 'INSERT INTO table1 VALUES (ARRAY[1, 2, 3]);' query: 'select list_field19 from table1;' diff --git a/java/adapter/jdbc/src/test/resources/h2/test1_selected_datatypes_null_h2.yml b/java/adapter/jdbc/src/test/resources/h2/test1_selected_datatypes_null_h2.yml index e8d1d5de02c63..60a4462272c7f 100644 --- a/java/adapter/jdbc/src/test/resources/h2/test1_selected_datatypes_null_h2.yml +++ b/java/adapter/jdbc/src/test/resources/h2/test1_selected_datatypes_null_h2.yml @@ -34,8 +34,8 @@ rowCount: '5' create: 'CREATE TABLE table1 (int_field1 INT, bool_field2 BOOLEAN, tinyint_field3 TINYINT, smallint_field4 SMALLINT, bigint_field5 BIGINT, decimal_field6 DECIMAL(20,2), double_field7 DOUBLE, real_field8 REAL, time_field9 TIME, date_field10 DATE, timestamp_field11 TIMESTAMP, - binary_field12 BINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(16), bit_field17 BIT, - list_field19 ARRAY, map_field20 VARCHAR(256));' + binary_field12 VARBINARY(100), varchar_field13 VARCHAR(256), blob_field14 BLOB, clob_field15 CLOB, char_field16 CHAR(14), bit_field17 BIT, + list_field19 INT ARRAY, map_field20 VARCHAR(256));' data: - 'INSERT INTO table1 (int_field1, bool_field2, tinyint_field3, smallint_field4) VALUES (102, 0, 46, 12001);' diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 269ac72d83326..a3e4da85b4321 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -64,7 +64,7 @@ com.h2database h2 - 1.4.196 + 2.2.224 test From 
50cc141310f5ebb10d018f8e6416fa92ec28a91b Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Thu, 14 Dec 2023 17:35:31 +0900 Subject: [PATCH 043/570] MINOR: [C++] Use Cast() instead of CastTo() for Timestamp Scalar in test (#39226) ### Rationale for this change Remove legacy code This is a sub-PR of the PR mentioned below. * #39060 ### What changes are included in this PR? * Replace the legacy scalar `CastTo` implementation for Timestamp Scalar in test. It was supposed to be resolved in the mentioned PR, but it was missed. ### Are these changes tested? Yes. It is passed by existing test cases. ### Are there any user-facing changes? No. Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- cpp/src/arrow/scalar_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 9d40e688f1dfb..ac740f92c8527 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -878,17 +878,17 @@ TEST(TestTimestampScalars, Cast) { EXPECT_EQ(convert(TimeUnit::MICRO, TimeUnit::MILLI, 4567), 4); ASSERT_OK_AND_ASSIGN(auto str, - TimestampScalar(1024, timestamp(TimeUnit::MILLI)).CastTo(utf8())); - EXPECT_EQ(*str, StringScalar("1970-01-01 00:00:01.024")); + Cast(TimestampScalar(1024, timestamp(TimeUnit::MILLI)), utf8())); + EXPECT_EQ(*str.scalar(), StringScalar("1970-01-01 00:00:01.024")); ASSERT_OK_AND_ASSIGN(auto i64, - TimestampScalar(1024, timestamp(TimeUnit::MILLI)).CastTo(int64())); - EXPECT_EQ(*i64, Int64Scalar(1024)); + Cast(TimestampScalar(1024, timestamp(TimeUnit::MILLI)), int64())); + EXPECT_EQ(*i64.scalar(), Int64Scalar(1024)); constexpr int64_t kMillisecondsInDay = 86400000; - ASSERT_OK_AND_ASSIGN( - auto d64, TimestampScalar(1024 * kMillisecondsInDay + 3, timestamp(TimeUnit::MILLI)) - .CastTo(date64())); - EXPECT_EQ(*d64, Date64Scalar(1024 * kMillisecondsInDay)); + ASSERT_OK_AND_ASSIGN(auto d64, Cast(TimestampScalar(1024 * kMillisecondsInDay + 3, + timestamp(TimeUnit::MILLI)), + date64())); + EXPECT_EQ(*d64.scalar(), Date64Scalar(1024 * kMillisecondsInDay)); } TEST(TestDurationScalars, Basics) { From 3236c129d1cbe3f73359278d1459a3f20e5c4df0 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 14 Dec 2023 14:12:17 +0000 Subject: [PATCH 044/570] GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627) ### Rationale for this change Resolves: https://github.com/apache/arrow/issues/36441 ### What changes are included in this PR? - Add python bindings for `CacheOptions` from the C++ side. - Allow setting `cache_options` on `ParquetFragmentScanOptions` from the python side. - Adjust some of the comments on `CacheOptions` ### Are these changes tested? Yes. I added python side tests for these newly available configs similar to other configs. I have not added an integration test that ensures setting the configs on the python side leads to correctly using them on the C++ side. ### Are there any user-facing changes? Yes. The are new configs available on the python side but the defaults are unchanged. I've added/updated docstrings where relevant. 
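As a rough illustration of the user-facing surface this change adds (a sketch based only on the bindings shown in the diff below; the option values are arbitrary and not recommendations):

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Tune read coalescing explicitly instead of relying on the (unchanged) defaults.
    cache_opts = pa.CacheOptions(hole_size_limit=2**10, range_size_limit=8 * 2**10, lazy=True)
    scan_opts = ds.ParquetFragmentScanOptions(pre_buffer=True, cache_options=cache_opts)

    # The options round-trip through the scan options and support equality checks.
    assert scan_opts.cache_options == cache_opts
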
* Closes: #36441 Lead-authored-by: Thomas Newton Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/io/caching.h | 10 +- python/pyarrow/__init__.py | 2 +- python/pyarrow/_dataset_parquet.pyx | 21 ++++- python/pyarrow/_parquet.pxd | 6 +- python/pyarrow/includes/libarrow.pxd | 16 ++++ python/pyarrow/io.pxi | 134 +++++++++++++++++++++++++++ python/pyarrow/lib.pxd | 12 +++ python/pyarrow/tests/test_dataset.py | 28 ++++-- python/pyarrow/tests/test_io.py | 59 ++++++++++++ 9 files changed, 271 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/io/caching.h b/cpp/src/arrow/io/caching.h index 9c1b8fe88b3bd..e2b911fafdbbc 100644 --- a/cpp/src/arrow/io/caching.h +++ b/cpp/src/arrow/io/caching.h @@ -42,6 +42,11 @@ struct ARROW_EXPORT CacheOptions { /// size greater than this, they are not combined int64_t range_size_limit; /// \brief A lazy cache does not perform any I/O until requested. + /// lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + /// lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + /// needs them. + /// lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + /// range that is currently being read. bool lazy; /// \brief The maximum number of ranges to be prefetched. This is only used /// for lazy cache to asynchronously read some ranges after reading the target range. @@ -56,9 +61,10 @@ struct ARROW_EXPORT CacheOptions { /// \brief Construct CacheOptions from network storage metrics (e.g. S3). /// /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in - /// milliseconds, also called call setup latency of a new S3 request. + /// milliseconds, also called call setup latency of a new read request. /// The value is a positive integer. - /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec. + /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec + /// (per connection). /// The value is a positive integer. /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction /// (per connection) to maximize the net data load. diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index cd66abcb44840..9da94885ec6b2 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -243,7 +243,7 @@ def print_entry(label, value): # I/O from pyarrow.lib import (NativeFile, PythonFile, - BufferedInputStream, BufferedOutputStream, + BufferedInputStream, BufferedOutputStream, CacheOptions, CompressedInputStream, CompressedOutputStream, TransformInputStream, transcoding_input_stream, FixedSizeBufferWriter, diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index d458ac4ee710d..61e051f56cfb0 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -42,6 +42,7 @@ from pyarrow._dataset cimport ( FileWriteOptions, Fragment, FragmentScanOptions, + CacheOptions, Partitioning, PartitioningFactory, WrittenFile @@ -693,6 +694,10 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): parallel using a background I/O thread pool. Set to False if you want to prioritize minimal memory usage over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. 
thrift_string_size_limit : int, default None If not None, override the maximum total string size allocated when decoding Thrift structures. The default limit should be @@ -714,6 +719,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): def __init__(self, *, bint use_buffered_stream=False, buffer_size=8192, bint pre_buffer=True, + cache_options=None, thrift_string_size_limit=None, thrift_container_size_limit=None, decryption_config=None, @@ -723,6 +729,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): self.use_buffered_stream = use_buffered_stream self.buffer_size = buffer_size self.pre_buffer = pre_buffer + if cache_options is not None: + self.cache_options = cache_options if thrift_string_size_limit is not None: self.thrift_string_size_limit = thrift_string_size_limit if thrift_container_size_limit is not None: @@ -770,6 +778,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): def pre_buffer(self, bint pre_buffer): self.arrow_reader_properties().set_pre_buffer(pre_buffer) + @property + def cache_options(self): + return CacheOptions.wrap(self.arrow_reader_properties().cache_options()) + + @cache_options.setter + def cache_options(self, CacheOptions options): + self.arrow_reader_properties().set_cache_options(options.unwrap()) + @property def thrift_string_size_limit(self): return self.reader_properties().thrift_string_size_limit() @@ -828,11 +844,11 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): bool """ attrs = ( - self.use_buffered_stream, self.buffer_size, self.pre_buffer, + self.use_buffered_stream, self.buffer_size, self.pre_buffer, self.cache_options, self.thrift_string_size_limit, self.thrift_container_size_limit, self.page_checksum_verification) other_attrs = ( - other.use_buffered_stream, other.buffer_size, other.pre_buffer, + other.use_buffered_stream, other.buffer_size, other.pre_buffer, other.cache_options, other.thrift_string_size_limit, other.thrift_container_size_limit, other.page_checksum_verification) return attrs == other_attrs @@ -849,6 +865,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): use_buffered_stream=self.use_buffered_stream, buffer_size=self.buffer_size, pre_buffer=self.pre_buffer, + cache_options=self.cache_options, thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, page_checksum_verification=self.page_checksum_verification diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 59b50ceda8c40..7ce747e0aa46d 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -21,8 +21,8 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport (CChunkedArray, CScalar, CSchema, CStatus, CTable, CMemoryPool, CBuffer, - CKeyValueMetadata, - CRandomAccessFile, COutputStream, + CKeyValueMetadata, CRandomAccessFile, + COutputStream, CCacheOptions, TimeUnit, CRecordBatchReader) from pyarrow.lib cimport _Weakrefable @@ -393,6 +393,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: int64_t batch_size() void set_pre_buffer(c_bool pre_buffer) c_bool pre_buffer() const + void set_cache_options(CCacheOptions options) + CCacheOptions cache_options() const void set_coerce_int96_timestamp_unit(TimeUnit unit) TimeUnit coerce_int96_timestamp_unit() const diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b0b89f8614f18..403846a38f3fd 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd 
@@ -1347,6 +1347,22 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: CStatus Write(const uint8_t* data, int64_t nbytes) CStatus Flush() + cdef cppclass CCacheOptions "arrow::io::CacheOptions": + int64_t hole_size_limit + int64_t range_size_limit + c_bool lazy + int64_t prefetch_limit + c_bool Equals "operator==" (CCacheOptions other) + + @staticmethod + CCacheOptions MakeFromNetworkMetrics(int64_t time_to_first_byte_millis, + int64_t transfer_bandwidth_mib_per_sec, + double ideal_bandwidth_utilization_frac, + int64_t max_ideal_request_size_mib) + + @staticmethod + CCacheOptions LazyDefaults() + cdef cppclass COutputStream" arrow::io::OutputStream"(FileInterface, Writable): pass diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 6f3916640199a..1897e76efc2a0 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -2122,6 +2122,140 @@ cdef CCompressionType _ensure_compression(str name) except *: raise ValueError('Invalid value for compression: {!r}'.format(name)) +cdef class CacheOptions(_Weakrefable): + """ + Cache options for a pre-buffered fragment scan. + + Parameters + ---------- + hole_size_limit : int, default 8KiB + The maximum distance in bytes between two consecutive ranges; beyond + this value, ranges are not combined. + range_size_limit : int, default 32MiB + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, + they are not combined + lazy : bool, default True + lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + range that is currently being read. + prefetch_limit : int, default 0 + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target + range. 
+ """ + + def __init__(self, *, hole_size_limit=None, range_size_limit=None, lazy=None, prefetch_limit=None): + self.wrapped = CCacheOptions.LazyDefaults() + if hole_size_limit is not None: + self.hole_size_limit = hole_size_limit + if range_size_limit is not None: + self.range_size_limit = range_size_limit + if lazy is not None: + self.lazy = lazy + if prefetch_limit is not None: + self.prefetch_limit = prefetch_limit + + cdef void init(self, CCacheOptions options): + self.wrapped = options + + cdef inline CCacheOptions unwrap(self): + return self.wrapped + + @staticmethod + cdef wrap(CCacheOptions options): + self = CacheOptions() + self.init(options) + return self + + @property + def hole_size_limit(self): + return self.wrapped.hole_size_limit + + @hole_size_limit.setter + def hole_size_limit(self, hole_size_limit): + self.wrapped.hole_size_limit = hole_size_limit + + @property + def range_size_limit(self): + return self.wrapped.range_size_limit + + @range_size_limit.setter + def range_size_limit(self, range_size_limit): + self.wrapped.range_size_limit = range_size_limit + + @property + def lazy(self): + return self.wrapped.lazy + + @lazy.setter + def lazy(self, lazy): + self.wrapped.lazy = lazy + + @property + def prefetch_limit(self): + return self.wrapped.prefetch_limit + + @prefetch_limit.setter + def prefetch_limit(self, prefetch_limit): + self.wrapped.prefetch_limit = prefetch_limit + + def __eq__(self, CacheOptions other): + try: + return self.unwrap().Equals(other.unwrap()) + except TypeError: + return False + + @staticmethod + def from_network_metrics(time_to_first_byte_millis, transfer_bandwidth_mib_per_sec, + ideal_bandwidth_utilization_frac=0.9, max_ideal_request_size_mib=64): + """ + Create suiteable CacheOptions based on provided network metrics. + + Typically this will be used with object storage solutions like Amazon S3, + Google Cloud Storage and Azure Blob Storage. + + Parameters + ---------- + time_to_first_byte_millis : int + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. + transfer_bandwidth_mib_per_sec : int + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + integer. + ideal_bandwidth_utilization_frac : int, default 0.9 + Transfer bandwidth utilization fraction (per connection) to maximize the net + data load. The value is a positive float less than 1. + max_ideal_request_size_mib : int, default 64 + The maximum single data request size (in MiB) to maximize the net data load. + + Returns + ------- + CacheOptions + """ + return CacheOptions.wrap(CCacheOptions.MakeFromNetworkMetrics( + time_to_first_byte_millis, transfer_bandwidth_mib_per_sec, + ideal_bandwidth_utilization_frac, max_ideal_request_size_mib)) + + @staticmethod + @binding(True) # Required for Cython < 3 + def _reconstruct(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + return CacheOptions(**kwargs) + + def __reduce__(self): + kwargs = dict( + hole_size_limit=self.hole_size_limit, + range_size_limit=self.range_size_limit, + lazy=self.lazy, + prefetch_limit=self.prefetch_limit, + ) + return CacheOptions._reconstruct, (kwargs,) + + cdef class Codec(_Weakrefable): """ Compression codec. 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 1440ba0750094..58ec34addbc0a 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -561,6 +561,18 @@ cdef class RecordBatchReader(_Weakrefable): SharedPtrNoGIL[CRecordBatchReader] reader +cdef class CacheOptions(_Weakrefable): + cdef: + CCacheOptions wrapped + + cdef void init(self, CCacheOptions options) + + cdef inline CCacheOptions unwrap(self) + + @staticmethod + cdef wrap(const CCacheOptions options) + + cdef class Codec(_Weakrefable): cdef: shared_ptr[CCodec] wrapped diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index f3c25ee8c5c3b..a37eb1e426f7a 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -16,17 +16,16 @@ # under the License. import contextlib -import os -import posixpath import datetime +import os import pathlib +import posixpath import sys -import textwrap import tempfile +import textwrap import threading import time from shutil import copytree - from urllib.parse import quote import numpy as np @@ -35,12 +34,12 @@ import pyarrow as pa import pyarrow.compute as pc import pyarrow.csv -import pyarrow.json import pyarrow.feather import pyarrow.fs as fs -from pyarrow.tests.util import (change_cwd, _filesystem_uri, - FSProtocolClass, ProxyHandler, - _configure_s3_limited_user) +import pyarrow.json +from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, + _configure_s3_limited_user, _filesystem_uri, + change_cwd) try: import pandas as pd @@ -138,7 +137,8 @@ def mockfs(): @pytest.fixture def open_logging_fs(monkeypatch): - from pyarrow.fs import PyFileSystem, LocalFileSystem + from pyarrow.fs import LocalFileSystem, PyFileSystem + from .test_fs import ProxyHandler localfs = LocalFileSystem() @@ -791,6 +791,9 @@ def test_parquet_scan_options(): thrift_container_size_limit=987654,) opts6 = ds.ParquetFragmentScanOptions( page_checksum_verification=True) + cache_opts = pa.CacheOptions( + hole_size_limit=2**10, range_size_limit=8*2**10, lazy=True) + opts7 = ds.ParquetFragmentScanOptions(pre_buffer=True, cache_options=cache_opts) assert opts1.use_buffered_stream is False assert opts1.buffer_size == 2**13 @@ -816,12 +819,17 @@ def test_parquet_scan_options(): assert opts6.page_checksum_verification is True + assert opts7.pre_buffer is True + assert opts7.cache_options == cache_opts + assert opts7.cache_options != opts1.cache_options + assert opts1 == opts1 assert opts1 != opts2 assert opts2 != opts3 assert opts3 != opts4 assert opts5 != opts1 assert opts6 != opts1 + assert opts7 != opts1 def test_file_format_pickling(pickle_module): @@ -2711,7 +2719,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): table, path, _, _, host, port, access_key, secret_key = s3_example_simple s3fs = pytest.importorskip("s3fs") - from pyarrow.fs import PyFileSystem, FSSpecHandler + from pyarrow.fs import FSSpecHandler, PyFileSystem fs = s3fs.S3FileSystem( key=access_key, diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 071962af290fc..5a495aa80abdf 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -664,6 +664,65 @@ def test_allocate_buffer_resizable(): assert buf.size == 200 +def test_cache_options(): + opts1 = pa.CacheOptions() + opts2 = pa.CacheOptions(hole_size_limit=1024) + opts3 = pa.CacheOptions(hole_size_limit=4096, range_size_limit=8192) + opts4 = pa.CacheOptions(hole_size_limit=4096, + range_size_limit=8192, prefetch_limit=5) + opts5 = 
pa.CacheOptions(hole_size_limit=4096, + range_size_limit=8192, lazy=False) + opts6 = pa.CacheOptions.from_network_metrics(time_to_first_byte_millis=100, + transfer_bandwidth_mib_per_sec=200, + ideal_bandwidth_utilization_frac=0.9, + max_ideal_request_size_mib=64) + + assert opts1.hole_size_limit == 8192 + assert opts1.range_size_limit == 32 * 1024 * 1024 + assert opts1.lazy is True + assert opts1.prefetch_limit == 0 + + assert opts2.hole_size_limit == 1024 + assert opts2.range_size_limit == 32 * 1024 * 1024 + assert opts2.lazy is True + assert opts2.prefetch_limit == 0 + + assert opts3.hole_size_limit == 4096 + assert opts3.range_size_limit == 8192 + assert opts3.lazy is True + assert opts3.prefetch_limit == 0 + + assert opts4.hole_size_limit == 4096 + assert opts4.range_size_limit == 8192 + assert opts4.lazy is True + assert opts4.prefetch_limit == 5 + + assert opts5.hole_size_limit == 4096 + assert opts5.range_size_limit == 8192 + assert opts5.lazy is False + assert opts5.prefetch_limit == 0 + + assert opts6.lazy is False + + assert opts1 == opts1 + assert opts1 != opts2 + assert opts2 != opts3 + assert opts3 != opts4 + assert opts4 != opts5 + assert opts6 != opts1 + + +def test_cache_options_pickling(pickle_module): + options = [ + pa.CacheOptions(), + pa.CacheOptions(hole_size_limit=4096, range_size_limit=8192, + lazy=True, prefetch_limit=5), + ] + + for option in options: + assert pickle_module.loads(pickle_module.dumps(option)) == option + + @pytest.mark.parametrize("compression", [ pytest.param( "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) From cf1b265f4481f1a42ce8362db82c377fb659a363 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Thu, 14 Dec 2023 09:36:48 -0500 Subject: [PATCH 045/570] GH-38930: [Java] Fix spelling (#38931) ### Rationale for this change ### What changes are included in this PR? Spelling fixes to java/ ### Are these changes tested? ### Are there any user-facing changes? 
* Closes: #38930 Authored-by: Josh Soref <2119212+jsoref@users.noreply.github.com> Signed-off-by: David Li --- .../java/org/apache/arrow/AvroToArrow.java | 12 +++++------ .../adapter/jdbc/ArrowVectorIterator.java | 2 +- .../arrow/adapter/jdbc/JdbcFieldInfo.java | 2 +- .../arrow/adapter/jdbc/JdbcToArrow.java | 4 ++-- .../arrow/adapter/jdbc/JdbcToArrowConfig.java | 2 +- .../adapter/jdbc/AbstractJdbcToArrowTest.java | 20 +++++++++---------- .../adapter/jdbc/UnreliableMetaDataTest.java | 2 +- .../adapter/jdbc/h2/JdbcAliasToArrowTest.java | 2 +- .../main/java/org/apache/arrow/c/Data.java | 2 +- .../java/org/apache/arrow/c/NativeUtil.java | 4 ++-- .../arrow/vector/StructVectorLoader.java | 2 +- .../org/apache/arrow/c/DictionaryTest.java | 2 +- .../compression/TestCompressionCodec.java | 2 +- java/dataset/src/main/cpp/jni_util.cc | 6 +++--- java/dataset/src/main/cpp/jni_wrapper.cc | 4 ++-- .../apache/arrow/dataset/jni/JniWrapper.java | 2 +- .../arrow/dataset/substrait/JniWrapper.java | 2 +- .../arrow/flight/OutboundStreamListener.java | 2 +- .../flight/auth2/ClientHandshakeWrapper.java | 2 +- .../driver/jdbc/ArrowDatabaseMetadata.java | 10 +++++----- .../client/ArrowFlightSqlClientHandler.java | 6 +++--- .../jdbc/utils/AvaticaParameterBinder.java | 2 +- .../utils/VectorSchemaRootTransformer.java | 4 ++-- .../jdbc/ArrowFlightJdbcDriverTest.java | 6 +++--- .../jdbc/utils/MockFlightSqlProducer.java | 2 +- .../flight/sql/FlightSqlColumnMetadata.java | 8 ++++---- .../arrow/flight/sql/util/TableRef.java | 2 +- .../arrow/gandiva/evaluator/JniLoader.java | 6 +++--- .../arrow/gandiva/expression/InNode.java | 8 ++++---- .../arrow/memory/AllocationReservation.java | 2 +- .../org/apache/arrow/memory/ArrowBuf.java | 2 +- .../main/codegen/templates/BaseWriter.java | 4 ++++ .../codegen/templates/DenseUnionVector.java | 2 +- .../arrow/vector/BaseFixedWidthVector.java | 4 ++-- .../vector/BaseLargeVariableWidthVector.java | 2 +- .../arrow/vector/BaseVariableWidthVector.java | 2 +- .../apache/arrow/vector/Decimal256Vector.java | 2 +- .../apache/arrow/vector/DecimalVector.java | 4 ++-- .../arrow/vector/ExtensionTypeVector.java | 2 +- .../vector/IntervalMonthDayNanoVector.java | 2 +- .../org/apache/arrow/vector/VectorLoader.java | 2 +- .../arrow/vector/complex/StructVector.java | 4 ++-- .../complex/impl/ComplexWriterImpl.java | 2 +- .../impl/NullableStructWriterFactory.java | 2 +- .../complex/impl/StructOrListWriterImpl.java | 11 ++++++++++ .../arrow/vector/util/DecimalUtility.java | 4 ++-- .../arrow/vector/util/VectorAppender.java | 2 +- .../validate/ValidateVectorTypeVisitor.java | 2 +- .../arrow/vector/TestDecimal256Vector.java | 2 +- .../arrow/vector/TestDenseUnionVector.java | 6 +++--- .../arrow/vector/TestVectorReAlloc.java | 8 ++++---- .../vector/ipc/MessageSerializerTest.java | 4 ++-- .../vector/types/pojo/TestExtensionType.java | 2 +- 53 files changed, 112 insertions(+), 97 deletions(-) diff --git a/java/adapter/avro/src/main/java/org/apache/arrow/AvroToArrow.java b/java/adapter/avro/src/main/java/org/apache/arrow/AvroToArrow.java index 9fb5ce291fde8..33f180393780e 100644 --- a/java/adapter/avro/src/main/java/org/apache/arrow/AvroToArrow.java +++ b/java/adapter/avro/src/main/java/org/apache/arrow/AvroToArrow.java @@ -39,9 +39,9 @@ public class AvroToArrow { */ static VectorSchemaRoot avroToArrow(Schema schema, Decoder decoder, AvroToArrowConfig config) throws IOException { - Preconditions.checkNotNull(schema, "Avro schema object can not be null"); - Preconditions.checkNotNull(decoder, "Avro decoder object 
can not be null"); - Preconditions.checkNotNull(config, "config can not be null"); + Preconditions.checkNotNull(schema, "Avro schema object cannot be null"); + Preconditions.checkNotNull(decoder, "Avro decoder object cannot be null"); + Preconditions.checkNotNull(config, "config cannot be null"); return AvroToArrowUtils.avroToArrowVectors(schema, decoder, config); } @@ -58,9 +58,9 @@ public static AvroToArrowVectorIterator avroToArrowIterator( Decoder decoder, AvroToArrowConfig config) throws IOException { - Preconditions.checkNotNull(schema, "Avro schema object can not be null"); - Preconditions.checkNotNull(decoder, "Avro decoder object can not be null"); - Preconditions.checkNotNull(config, "config can not be null"); + Preconditions.checkNotNull(schema, "Avro schema object cannot be null"); + Preconditions.checkNotNull(decoder, "Avro decoder object cannot be null"); + Preconditions.checkNotNull(config, "config cannot be null"); return AvroToArrowVectorIterator.create(decoder, schema, config); } diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/ArrowVectorIterator.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/ArrowVectorIterator.java index 6e789009dd20a..632c7c474b4a9 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/ArrowVectorIterator.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/ArrowVectorIterator.java @@ -53,7 +53,7 @@ public class ArrowVectorIterator implements Iterator, AutoClos private final int targetBatchSize; - // This is used to track whether the ResultSet has been fully read, and is needed spcifically for cases where there + // This is used to track whether the ResultSet has been fully read, and is needed specifically for cases where there // is a ResultSet having zero rows (empty): private boolean readComplete = false; diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcFieldInfo.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcFieldInfo.java index 97ca8f27ceb49..d16964ea14417 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcFieldInfo.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcFieldInfo.java @@ -129,7 +129,7 @@ public JdbcFieldInfo(ResultSetMetaData rsmd, int column) throws SQLException { /** * Builds a JdbcFieldInfo from the corresponding row from a {@link java.sql.DatabaseMetaData#getColumns} - * ResulSet. + * ResultSet. * * @param rs The {@link java.sql.ResultSet} to get the field information from. * @throws SQLException If the column information cannot be retrieved. 
diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java index daee64d93080a..246451b5b22f9 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java @@ -77,7 +77,7 @@ public static ArrowVectorIterator sqlToArrowVectorIterator( ResultSet resultSet, BufferAllocator allocator) throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); + Preconditions.checkNotNull(allocator, "Memory Allocator object cannot be null"); JdbcToArrowConfig config = new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar()); @@ -96,7 +96,7 @@ public static ArrowVectorIterator sqlToArrowVectorIterator( ResultSet resultSet, JdbcToArrowConfig config) throws SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object cannot be null"); Preconditions.checkNotNull(config, "The configuration cannot be null"); return ArrowVectorIterator.create(resultSet, config); } diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java index e23bad54afc14..68851f4a98bc9 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java @@ -121,7 +121,7 @@ public final class JdbcToArrowConfig { * @param reuseVectorSchemaRoot Whether to reuse the vector schema root for each data load. * @param arraySubTypesByColumnIndex The type of the JDBC array at the column index (1-based). * @param arraySubTypesByColumnName The type of the JDBC array at the column name. - * @param targetBatchSize The target batch size to be used in preallcation of the resulting vectors. + * @param targetBatchSize The target batch size to be used in preallocation of the resulting vectors. * @param jdbcToArrowTypeConverter The function that maps JDBC field type information to arrow type. If set to null, * the default mapping will be used, which is defined as: *
    diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java index dc36ef9f8275b..88a66a31aa2c9 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java @@ -183,7 +183,7 @@ public static Object[][] prepareTestData(String[] testFiles, @SuppressWarnings(" */ public VectorSchemaRoot sqlToArrow(Connection connection, String query, BufferAllocator allocator) throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory allocator object can not be null"); + Preconditions.checkNotNull(allocator, "Memory allocator object cannot be null"); JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, JdbcToArrowUtils.getUtcCalendar()) .setArraySubTypeByColumnNameMap(ARRAY_SUB_TYPE_BY_COLUMN_NAME_MAP) @@ -212,8 +212,8 @@ public VectorSchemaRoot sqlToArrow( BufferAllocator allocator, Calendar calendar) throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory allocator object can not be null"); - Preconditions.checkNotNull(calendar, "Calendar object can not be null"); + Preconditions.checkNotNull(allocator, "Memory allocator object cannot be null"); + Preconditions.checkNotNull(calendar, "Calendar object cannot be null"); JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, calendar) .setArraySubTypeByColumnNameMap(ARRAY_SUB_TYPE_BY_COLUMN_NAME_MAP) @@ -237,8 +237,8 @@ public VectorSchemaRoot sqlToArrow( */ public static VectorSchemaRoot sqlToArrow(Connection connection, String query, JdbcToArrowConfig config) throws SQLException, IOException { - Preconditions.checkNotNull(connection, "JDBC connection object can not be null"); - Preconditions.checkArgument(query != null && query.length() > 0, "SQL query can not be null or empty"); + Preconditions.checkNotNull(connection, "JDBC connection object cannot be null"); + Preconditions.checkArgument(query != null && query.length() > 0, "SQL query cannot be null or empty"); try (Statement stmt = connection.createStatement()) { return sqlToArrow(stmt.executeQuery(query), config); @@ -256,7 +256,7 @@ public static VectorSchemaRoot sqlToArrow(Connection connection, String query, J * @throws SQLException on error */ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet) throws SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object cannot be null"); return sqlToArrow(resultSet, JdbcToArrowUtils.getUtcCalendar()); } @@ -273,7 +273,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet) throws SQLExcepti */ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BufferAllocator allocator) throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); + Preconditions.checkNotNull(allocator, "Memory Allocator object cannot be null"); JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, JdbcToArrowUtils.getUtcCalendar()) .setArraySubTypeByColumnNameMap(ARRAY_SUB_TYPE_BY_COLUMN_NAME_MAP) @@ -292,7 +292,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BufferAllocator a * @throws SQLException on error */ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar) throws 
SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object cannot be null"); JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), calendar) .setArraySubTypeByColumnNameMap(ARRAY_SUB_TYPE_BY_COLUMN_NAME_MAP) @@ -316,7 +316,7 @@ public static VectorSchemaRoot sqlToArrow( BufferAllocator allocator, Calendar calendar) throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); + Preconditions.checkNotNull(allocator, "Memory Allocator object cannot be null"); JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, calendar) .setArraySubTypeByColumnNameMap(ARRAY_SUB_TYPE_BY_COLUMN_NAME_MAP) @@ -336,7 +336,7 @@ public static VectorSchemaRoot sqlToArrow( */ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, JdbcToArrowConfig config) throws SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object cannot be null"); Preconditions.checkNotNull(config, "The configuration cannot be null"); VectorSchemaRoot root = VectorSchemaRoot.create( diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/UnreliableMetaDataTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/UnreliableMetaDataTest.java index 90554578d1f45..3eb886faabc10 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/UnreliableMetaDataTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/UnreliableMetaDataTest.java @@ -151,7 +151,7 @@ public void testInconsistentPrecisionAndScale() throws Exception { assertThrows(RuntimeException.class, iter::next, "This is expected to fail due to inconsistent BigDecimal scales, while strict matching is enabled."); } - // Reuse same ResultSet, with RoundingMode.UNNECESSARY set to coerce BigDecmial scale as needed: + // Reuse same ResultSet, with RoundingMode.UNNECESSARY set to coerce BigDecimal scale as needed: config = new JdbcToArrowConfigBuilder( allocator, JdbcToArrowUtils.getUtcCalendar(), /* include metadata */ false) .setReuseVectorSchemaRoot(reuseVectorSchemaRoot) diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java index a6e6b22fcb45d..d9acfe88f4f8b 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java @@ -63,7 +63,7 @@ public void setUp() throws Exception { /** * Test h2 database query with alias for column name and column label. - * To vetify reading field alias from an H2 database works as expected. + * To verify reading field alias from an H2 database works as expected. * If this test fails, something is either wrong with the setup, * or the H2 SQL behavior changed. 
*/ diff --git a/java/c/src/main/java/org/apache/arrow/c/Data.java b/java/c/src/main/java/org/apache/arrow/c/Data.java index 6cb0c0ac40aca..a92853b3504f0 100644 --- a/java/c/src/main/java/org/apache/arrow/c/Data.java +++ b/java/c/src/main/java/org/apache/arrow/c/Data.java @@ -222,7 +222,7 @@ public static void exportVectorSchemaRoot(BufferAllocator allocator, VectorSchem /** * Export a reader as an ArrowArrayStream using the C Stream Interface. - * @param allocator Buffer allocator for allocating C data inteface fields + * @param allocator Buffer allocator for allocating C data interface fields * @param reader Reader to export * @param out C struct to export the stream */ diff --git a/java/c/src/main/java/org/apache/arrow/c/NativeUtil.java b/java/c/src/main/java/org/apache/arrow/c/NativeUtil.java index b152ea4e7c9fd..ba65fd80c4141 100644 --- a/java/c/src/main/java/org/apache/arrow/c/NativeUtil.java +++ b/java/c/src/main/java/org/apache/arrow/c/NativeUtil.java @@ -115,7 +115,7 @@ public static void closeBuffer(ArrowBuf buf) { * Get the address of a buffer or {@value #NULL} if the input buffer is null. * * @param buf Buffer to get the address of - * @return Memory addresss or {@value #NULL} + * @return Memory address or {@value #NULL} */ public static long addressOrNull(ArrowBuf buf) { if (buf == null) { @@ -129,7 +129,7 @@ public static long addressOrNull(ArrowBuf buf) { * struct is null. * * @param struct C Data Interface struct to get the address of - * @return Memory addresss or {@value #NULL} + * @return Memory address or {@value #NULL} */ public static long addressOrNull(BaseStruct struct) { if (struct == null) { diff --git a/java/c/src/main/java/org/apache/arrow/vector/StructVectorLoader.java b/java/c/src/main/java/org/apache/arrow/vector/StructVectorLoader.java index 10e35701776ee..4a62be7851ac7 100644 --- a/java/c/src/main/java/org/apache/arrow/vector/StructVectorLoader.java +++ b/java/c/src/main/java/org/apache/arrow/vector/StructVectorLoader.java @@ -101,7 +101,7 @@ public StructVector load(BufferAllocator allocator, ArrowRecordBatch recordBatch private void loadBuffers(FieldVector vector, Field field, Iterator buffers, Iterator nodes, CompressionCodec codec) { - checkArgument(nodes.hasNext(), "no more field nodes for for field %s and vector %s", field, vector); + checkArgument(nodes.hasNext(), "no more field nodes for field %s and vector %s", field, vector); ArrowFieldNode fieldNode = nodes.next(); int bufferLayoutCount = TypeLayout.getTypeBufferCount(field.getType()); List ownBuffers = new ArrayList<>(bufferLayoutCount); diff --git a/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java b/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java index 9dcb262af4616..d892781756ede 100644 --- a/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/DictionaryTest.java @@ -128,7 +128,7 @@ public void testRoundtripMultipleBatches() throws IOException { ArrowSchema consumerArrowSchema = ArrowSchema.allocateNew(allocator)) { // Load first batch reader.loadNextBatch(); - // Producer fills consumer schema stucture + // Producer fills consumer schema structure Data.exportSchema(allocator, reader.getVectorSchemaRoot().getSchema(), reader, consumerArrowSchema); // Consumer loads it as an empty vector schema root try (CDataDictionaryProvider consumerDictionaryProvider = new CDataDictionaryProvider(); diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java 
b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java index 7db00cfde485d..403130edba52e 100644 --- a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java @@ -317,7 +317,7 @@ void withRoot(CompressionUtil.CodecType codec, BiConsumerFindClass(class_name.c_str()); - DCHECK_NE(jclass, nullptr) << "Could not find Java class " << class_name; - return env->IsInstanceOf(t, jclass); + jclass java_class = env->FindClass(class_name.c_str()); + DCHECK_NE(java_class, nullptr) << "Could not find Java class " << class_name; + return env->IsInstanceOf(t, java_class); } arrow::StatusCode MapJavaError(JNIEnv* env, jthrowable t) { diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 49e0f1720909f..d2d976677bd6b 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -282,7 +282,7 @@ std::unordered_map> LoadNamedTables(J std::unordered_map> map_table_to_record_batch_reader; int length = env->GetArrayLength(str_array); if (length % 2 != 0) { - JniThrow("Can not map odd number of array elements to key/value pairs"); + JniThrow("Cannot map odd number of array elements to key/value pairs"); } std::shared_ptr output_table; for (int pos = 0; pos < length; pos++) { @@ -399,7 +399,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_NativeMemoryPool_bytes JNI_METHOD_START arrow::MemoryPool* pool = reinterpret_cast(memory_pool_id); if (pool == nullptr) { - JniThrow("Memory pool instance not found. It may not exist nor has been closed"); + JniThrow("Memory pool instance not found. It may not exist or have been closed"); } return pool->bytes_allocated(); JNI_METHOD_END(-1L) diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java index a7df5be42f13b..637a3e8f22a9a 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java @@ -116,7 +116,7 @@ public native long createScanner(long datasetId, String[] columns, ByteBuffer su public native void releaseBuffer(long bufferId); /** - * Ensure the S3 APIs are shutdown, but only if not already done. If the S3 APIs are unintialized, + * Ensure the S3 APIs are shutdown, but only if not already done. If the S3 APIs are uninitialized, * then this is a noop. */ public native void ensureS3Finalized(); diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/JniWrapper.java b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/JniWrapper.java index 236d1d5616061..5cb68f8514678 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/JniWrapper.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/JniWrapper.java @@ -25,7 +25,7 @@ * Class that contains Native methods to call Acero C++ Substrait API. It internally depends on C++ function * arrow::engine::ExecuteSerializedPlan. Currently supported input parameters supported are: *
    - * - arrow::Buffer: Susbtrait Plan (JSON or Binary format).
    + * - arrow::Buffer: Substrait Plan (JSON or Binary format).
      * - arrow::engine::ConversionOptions: Mapping for arrow::engine::NamedTableProvider.
      * 
    */ diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java index 38a44d0e5913f..e80fb41c67273 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java @@ -106,7 +106,7 @@ default void start(VectorSchemaRoot root, DictionaryProvider dictionaries) { void completed(); /** - * Toggle whether to ues the zero-copy write optimization. + * Toggle whether to use the zero-copy write optimization. * *

    By default or when disabled, Arrow may copy data into a buffer for the underlying implementation to * send. When enabled, Arrow will instead try to directly enqueue the Arrow buffer for sending. Not all diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth2/ClientHandshakeWrapper.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth2/ClientHandshakeWrapper.java index 16a5142509d4d..c84739d2e345c 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth2/ClientHandshakeWrapper.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth2/ClientHandshakeWrapper.java @@ -61,7 +61,7 @@ public static void doClientHandshake(FlightServiceStub stub) { throw wrappedException; } } catch (StatusRuntimeException sre) { - logger.error("Failed with SREe", sre); + logger.error("Failed with SRE", sre); throw StatusUtils.fromGrpcRuntimeException(sre); } catch (Throwable ex) { logger.error("Failed with unknown", ex); diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java index 3487e58a64678..d68b8070e2bb7 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java @@ -1048,8 +1048,8 @@ private int setGetColumnsVectorSchemaRootFromFields(final VectorSchemaRoot curre SqlTypes.getSqlTypeNameFromArrowType(fieldType).getBytes(CHARSET); typeNameVector.setSafe(insertIndex, typeName); - // We're not setting COLUMN_SIZE for ROWID SQL Types, as there's no such Arrow type. - // We're not setting COLUMN_SIZE nor DECIMAL_DIGITS for Float/Double as their precision and scale are variable. + // We aren't setting COLUMN_SIZE for ROWID SQL Types, as there's no such Arrow type. + // We aren't setting COLUMN_SIZE nor DECIMAL_DIGITS for Float/Double as their precision and scale are variable. if (fieldType instanceof ArrowType.Decimal) { numPrecRadixVector.setSafe(insertIndex, BASE10_RADIX); } else if (fieldType instanceof ArrowType.Int) { @@ -1101,7 +1101,7 @@ private static byte[] booleanToYesOrNo(boolean autoIncrement) { } static Integer getDecimalDigits(final ArrowType fieldType) { - // We're not setting DECIMAL_DIGITS for Float/Double as their precision and scale are variable. + // We aren't setting DECIMAL_DIGITS for Float/Double as their precision and scale are variable. if (fieldType instanceof ArrowType.Decimal) { final ArrowType.Decimal thisDecimal = (ArrowType.Decimal) fieldType; return thisDecimal.getScale(); @@ -1141,8 +1141,8 @@ static Integer getDecimalDigits(final ArrowType fieldType) { } static Integer getColumnSize(final ArrowType fieldType) { - // We're not setting COLUMN_SIZE for ROWID SQL Types, as there's no such Arrow type. - // We're not setting COLUMN_SIZE nor DECIMAL_DIGITS for Float/Double as their precision and scale are variable. + // We aren't setting COLUMN_SIZE for ROWID SQL Types, as there's no such Arrow type. + // We aren't setting COLUMN_SIZE nor DECIMAL_DIGITS for Float/Double as their precision and scale are variable. 
if (fieldType instanceof ArrowType.Decimal) { final ArrowType.Decimal thisDecimal = (ArrowType.Decimal) fieldType; return thisDecimal.getPrecision(); diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java index 54fd17853c00b..234820bd41823 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/ArrowFlightSqlClientHandler.java @@ -469,8 +469,8 @@ public static final class Builder { @VisibleForTesting boolean retainAuth = true; - // These two middlewares are for internal use within build() and should not be exposed by builder APIs. - // Note that these middlewares may not necessarily be registered. + // These two middleware are for internal use within build() and should not be exposed by builder APIs. + // Note that these middleware may not necessarily be registered. @VisibleForTesting ClientIncomingAuthHeaderMiddleware.Factory authFactory = new ClientIncomingAuthHeaderMiddleware.Factory(new ClientBearerHeaderHandler()); @@ -742,7 +742,7 @@ public Builder withCallOptions(final Collection options) { * @throws SQLException on error. */ public ArrowFlightSqlClientHandler build() throws SQLException { - // Copy middlewares so that the build method doesn't change the state of the builder fields itself. + // Copy middleware so that the build method doesn't change the state of the builder fields itself. Set buildTimeMiddlewareFactories = new HashSet<>(this.middlewareFactories); FlightClient client = null; boolean isUsingUserPasswordAuth = username != null && token == null; diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index 5fa3ba38f2506..b2bd8e745ecca 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -94,7 +94,7 @@ public void bind(List typedValues, int index) { } /** - * Bind a TypedValue to the given index on the FieldVctor. + * Bind a TypedValue to the given index on the FieldVector. * * @param vector FieldVector to bind to. * @param typedValue TypedValue to bind to the vector. 
diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java index 3bab918c83aab..52a1d7db791c2 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/VectorSchemaRootTransformer.java @@ -74,7 +74,7 @@ public Builder renameFieldVector(final String originalVectorName, final ArrowType transformedType = transformedVector.getField().getType(); if (!originalType.equals(transformedType)) { throw new IllegalArgumentException(String.format( - "Can not transfer vector with field type %s to %s", originalType, transformedType)); + "Cannot transfer vector with field type %s to %s", originalType, transformedType)); } if (originalVector instanceof BaseVariableWidthVector) { @@ -85,7 +85,7 @@ public Builder renameFieldVector(final String originalVectorName, ((BaseFixedWidthVector) transformedVector)); } else { throw new IllegalStateException(String.format( - "Can not transfer vector of type %s", originalVector.getClass())); + "Cannot transfer vector of type %s", originalVector.getClass())); } }); diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriverTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriverTest.java index 9b8fa96d2320e..784fd5b292b27 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriverTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriverTest.java @@ -142,7 +142,7 @@ public void testConnectWithInsensitiveCasePropertyKeys() throws Exception { driver.connect("jdbc:arrow-flight://" + dataSource.getConfig().getHost() + ":" + dataSource.getConfig().getPort() + "?" + - "UseEncryptiOn=false", + "UseEncryptIon=false", dataSource.getProperties(dataSource.getConfig().getUser(), dataSource.getConfig().getPassword()))) { assertTrue(connection.isValid(300)); } @@ -150,7 +150,7 @@ public void testConnectWithInsensitiveCasePropertyKeys() throws Exception { driver.connect("jdbc:arrow-flight-sql://" + dataSource.getConfig().getHost() + ":" + dataSource.getConfig().getPort() + "?" 
+ - "UseEncryptiOn=false", + "UseEncryptIon=false", dataSource.getProperties(dataSource.getConfig().getUser(), dataSource.getConfig().getPassword()))) { assertTrue(connection.isValid(300)); } @@ -162,7 +162,7 @@ public void testConnectWithInsensitiveCasePropertyKeys2() throws Exception { final Driver driver = new ArrowFlightJdbcDriver(); Properties properties = dataSource.getProperties(dataSource.getConfig().getUser(), dataSource.getConfig().getPassword()); - properties.put("UseEncryptiOn", "false"); + properties.put("UseEncryptIon", "false"); try (Connection connection = driver.connect("jdbc:arrow-flight://" + diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java index f36956f193ce8..c165bfb7ce336 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java @@ -195,7 +195,7 @@ void addUpdateQuery(final String sqlCommand, final BiConsumer> resultsProvider) { Preconditions.checkState( updateResultProviders.putIfAbsent(sqlCommand, resultsProvider) == null, - format("Attempted to overwrite pre-existing query: <%s>.", sqlCommand)); + format("Attempted to overwrite preexisting query: <%s>.", sqlCommand)); } /** Registers parameters expected to be provided with a prepared statement. */ diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlColumnMetadata.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlColumnMetadata.java index bd52e4b495e6e..186e8bc04ec9c 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlColumnMetadata.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlColumnMetadata.java @@ -145,8 +145,8 @@ public Boolean isAutoIncrement() { } /** - * Returns if the column is case sensitive. - * @return True if the column is case sensitive, false otherwise. + * Returns if the column is case-sensitive. + * @return True if the column is case-sensitive, false otherwise. */ public Boolean isCaseSensitive() { String value = metadataMap.get(IS_CASE_SENSITIVE); @@ -267,8 +267,8 @@ public Builder isAutoIncrement(boolean isAutoIncrement) { } /** - * Sets if the column is case sensitive. - * @param isCaseSensitive If the column is case sensitive. + * Sets if the column is case-sensitive. + * @param isCaseSensitive If the column is case-sensitive. * @return This builder. */ public Builder isCaseSensitive(boolean isCaseSensitive) { diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/util/TableRef.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/util/TableRef.java index 315f17ee911cf..b3751cab9038a 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/util/TableRef.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/util/TableRef.java @@ -66,7 +66,7 @@ public String getDbSchema() { } /** - * Retreives the table from the object. + * Retrieves the table from the object. * @return the table. 
*/ public String getTable() { diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java index 5158d52f8c998..2528989f3784b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java @@ -158,10 +158,10 @@ static long getDefaultConfiguration() throws GandivaException { synchronized (ConfigurationBuilder.class) { if (defaultConfiguration == 0L) { JniLoader.getInstance(); // setup - ConfigurationBuilder.ConfigOptions defaultConfigOptons = ConfigurationBuilder.ConfigOptions.getDefault(); + ConfigurationBuilder.ConfigOptions defaultConfigOptions = ConfigurationBuilder.ConfigOptions.getDefault(); defaultConfiguration = new ConfigurationBuilder() - .buildConfigInstance(defaultConfigOptons); - configurationMap.put(defaultConfigOptons, defaultConfiguration); + .buildConfigInstance(defaultConfigOptions); + configurationMap.put(defaultConfigOptions, defaultConfiguration); } } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/InNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/InNode.java index 0f8de962869d8..bb1391b4001ea 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/InNode.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/InNode.java @@ -64,7 +64,7 @@ private InNode(Set values, Set longValues, Set stringValu * * @param node Node with the 'IN' clause. * @param intValues Int values to build the IN node. - * @retur InNode referring to tree node. + * @return InNode referring to tree node. */ public static InNode makeIntInExpr(TreeNode node, Set intValues) { return new InNode(intValues, @@ -77,7 +77,7 @@ public static InNode makeIntInExpr(TreeNode node, Set intValues) { * * @param node Node with the 'IN' clause. * @param longValues Long values to build the IN node. - * @retur InNode referring to tree node. + * @return InNode referring to tree node. */ public static InNode makeLongInExpr(TreeNode node, Set longValues) { return new InNode(null, longValues, @@ -90,7 +90,7 @@ public static InNode makeLongInExpr(TreeNode node, Set longValues) { * * @param node Node with the 'IN' clause. * @param floatValues Float values to build the IN node. - * @retur InNode referring to tree node. + * @return InNode referring to tree node. */ public static InNode makeFloatInExpr(TreeNode node, Set floatValues) { return new InNode(null, null, null, null, null, null, @@ -102,7 +102,7 @@ public static InNode makeFloatInExpr(TreeNode node, Set floatValues) { * * @param node Node with the 'IN' clause. * @param doubleValues Double values to build the IN node. - * @retur InNode referring to tree node. + * @return InNode referring to tree node. 
*/ public static InNode makeDoubleInExpr(TreeNode node, Set doubleValues) { return new InNode(null, null, null, null, null, diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationReservation.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationReservation.java index 4331eb20ca3b6..c672dc48d79ca 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationReservation.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationReservation.java @@ -59,7 +59,7 @@ public interface AllocationReservation extends AutoCloseable { * requested is available, if the allocation cannot be made contiguously.

    * * @return the buffer, or null, if the request cannot be satisfied - * @throws IllegalStateException if called called more than once + * @throws IllegalStateException if called more than once */ ArrowBuf allocateBuffer(); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java index 5b322b4ff566b..2c2e93b2d70ce 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ArrowBuf.java @@ -1210,7 +1210,7 @@ public ArrowBuf setOne(long index, long length) { } /** - * Returns this if size is less then {@link #capacity()}, otherwise + * Returns this if size is less than {@link #capacity()}, otherwise * delegates to {@link BufferManager#replace(ArrowBuf, long)} to get a new buffer. */ public ArrowBuf reallocIfNeeded(final long size) { diff --git a/java/vector/src/main/codegen/templates/BaseWriter.java b/java/vector/src/main/codegen/templates/BaseWriter.java index 3b35d22692e68..35df256b324b5 100644 --- a/java/vector/src/main/codegen/templates/BaseWriter.java +++ b/java/vector/src/main/codegen/templates/BaseWriter.java @@ -117,7 +117,11 @@ public interface StructOrListWriter { void start(); void end(); StructOrListWriter struct(String name); + /** + * @deprecated use {@link #listOfStruct()} instead. + */ StructOrListWriter listoftstruct(String name); + StructOrListWriter listOfStruct(String name); StructOrListWriter list(String name); boolean isStructWriter(); boolean isListWriter(); diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index de0cf84fd82ad..c23caf3bb5a03 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -662,7 +662,7 @@ public void splitAndTransfer(int startIndex, int length) { ReferenceManager refManager = slicedBuffer.getReferenceManager(); to.typeBuffer = refManager.transferOwnership(slicedBuffer, to.allocator).getTransferredBuffer(); - // transfer offset byffer + // transfer offset buffer while (to.offsetBuffer.capacity() < (long) length * OFFSET_WIDTH) { to.reallocOffsetBuffer(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index d09664e6d313e..90229460111c3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -478,7 +478,7 @@ public List getFieldInnerVectors() { @Override public void initializeChildrenFromFields(List children) { if (!children.isEmpty()) { - throw new IllegalArgumentException("primitive type vector can not have children"); + throw new IllegalArgumentException("primitive type vector cannot have children"); } } @@ -608,7 +608,7 @@ public TransferPair getTransferPair(BufferAllocator allocator) { public abstract TransferPair getTransferPair(Field field, BufferAllocator allocator); /** - * Transfer this vector'data to another vector. The memory associated + * Transfer this vector's data to another vector. The memory associated * with this vector is transferred to the allocator of target vector * for accounting and management purposes. 
* @param target destination vector for transfer diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index fcac28bd08470..a77278138f28c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -276,7 +276,7 @@ public List getFieldInnerVectors() { @Override public void initializeChildrenFromFields(List children) { if (!children.isEmpty()) { - throw new IllegalArgumentException("primitive type vector can not have children"); + throw new IllegalArgumentException("primitive type vector cannot have children"); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index a0a5e085a5a8a..46bc9815f037a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -295,7 +295,7 @@ public List getFieldInnerVectors() { @Override public void initializeChildrenFromFields(List children) { if (!children.isEmpty()) { - throw new IllegalArgumentException("primitive type vector can not have children"); + throw new IllegalArgumentException("primitive type vector cannot have children"); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java index 79a9badc3955d..fe650c7d28074 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/Decimal256Vector.java @@ -212,7 +212,7 @@ public void set(int index, ArrowBuf buffer) { * ArrowBuf of decimal vector. * *

    This method takes care of adding the necessary padding if the length - * of byte array is less then 32 (length of decimal type). + * of byte array is less than 32 (length of decimal type). * * @param index position of element * @param value array of bytes containing decimal in big endian byte order. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java index d1a3bfc3afb10..7c3662c86748b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java @@ -159,7 +159,7 @@ public BigDecimal getObject(int index) { } /** - * Same as {@link #getObect(int)} but does not check for null. + * Same as {@link #getObject(int)} but does not check for null. * * @param index position of element * @return element at given index @@ -211,7 +211,7 @@ public void set(int index, ArrowBuf buffer) { * ArrowBuf of decimal vector. * *

    This method takes care of adding the necessary padding if the length - * of byte array is less then 16 (length of decimal type). + * of byte array is less than 16 (length of decimal type). * * @param index position of element * @param value array of bytes containing decimal in big endian byte order. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java index a70efe61bcdfe..3a35a44403492 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java @@ -50,7 +50,7 @@ public abstract class ExtensionTypeVector e */ public ExtensionTypeVector(String name, BufferAllocator allocator, T underlyingVector) { super(allocator); - Preconditions.checkNotNull(underlyingVector, "underlyingVector can not be null."); + Preconditions.checkNotNull(underlyingVector, "underlyingVector cannot be null."); this.name = name; this.underlyingVector = underlyingVector; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java b/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java index 73bbc0a2c19f2..fc0aa9d27b1c3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/IntervalMonthDayNanoVector.java @@ -39,7 +39,7 @@ * A validity buffer (bit vector) is maintained to track which elements in the * vector are null. * - * Month, day and nanoseconds are indepndent from one another and there + * Month, day and nanoseconds are independent from one another and there * is no specific limits imposed on their values. */ public final class IntervalMonthDayNanoVector extends BaseFixedWidthVector { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index ed5f3aef17397..510cef24c7e16 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -96,7 +96,7 @@ private void loadBuffers( Iterator buffers, Iterator nodes, CompressionCodec codec) { - checkArgument(nodes.hasNext(), "no more field nodes for for field %s and vector %s", field, vector); + checkArgument(nodes.hasNext(), "no more field nodes for field %s and vector %s", field, vector); ArrowFieldNode fieldNode = nodes.next(); int bufferLayoutCount = TypeLayout.getTypeBufferCount(field.getType()); List ownBuffers = new ArrayList<>(bufferLayoutCount); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java index d0304a6fd2504..27db1574808a3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java @@ -100,7 +100,7 @@ public StructVector(String name, * @param fieldType The type of this list. * @param callBack A schema change callback. * @param conflictPolicy policy to determine how duplicate names are handled. - * @param allowConflictPolicyChanges wether duplicate names are allowed at all. + * @param allowConflictPolicyChanges whether duplicate names are allowed at all. 
*/ public StructVector(String name, BufferAllocator allocator, @@ -139,7 +139,7 @@ public StructVector(Field field, * @param allocator The allocator to use to allocating/reallocating buffers. * @param callBack A schema change callback. * @param conflictPolicy policy to determine how duplicate names are handled. - * @param allowConflictPolicyChanges wether duplicate names are allowed at all. + * @param allowConflictPolicyChanges whether duplicate names are allowed at all. */ public StructVector(Field field, BufferAllocator allocator, diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 8d2694b6df887..8dd5763990fa8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -49,7 +49,7 @@ private enum Mode { INIT, STRUCT, LIST, MAP } * @param name The name of the writer (for tracking). * @param container A container for the data field to be written. * @param unionEnabled Unused. - * @param caseSensitive Whether field names are case sensitive (if false field names will be lowercase. + * @param caseSensitive Whether field names are case-sensitive (if false field names will be lowercase. */ public ComplexWriterImpl( String name, diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java index 458aa7b610147..a305529b71fa8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableStructWriterFactory.java @@ -21,7 +21,7 @@ /** * A factory for {@link NullableStructWriter} instances. The factory allows for configuring if field - * names should be considered case sensitive. + * names should be considered case-sensitive. */ public class NullableStructWriterFactory { private final boolean caseSensitive; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java index e9c0825dd3d49..5c4cd2af98d55 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/StructOrListWriterImpl.java @@ -87,8 +87,19 @@ public StructOrListWriter struct(final String name) { * Creates a new writer for a list of structs. * * @param name Unused. + * + * @deprecated use {@link #listOfStruct()} instead. */ public StructOrListWriter listoftstruct(final String name) { + return listOfStruct(name); + } + + /** + * Creates a new writer for a list of structs. + * + * @param name Unused. 
+ */ + public StructOrListWriter listOfStruct(final String name) { assert list != null; return new StructOrListWriterImpl(list.struct()); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index a81169b8f7d73..0dfb61dcdf269 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -94,7 +94,7 @@ public static boolean checkPrecisionAndScale(BigDecimal value, int vectorPrecisi value.scale() + " != " + vectorScale); } if (value.precision() > vectorPrecision) { - throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " + + throw new UnsupportedOperationException("BigDecimal precision cannot be greater than that in the Arrow " + "vector: " + value.precision() + " > " + vectorPrecision); } return true; @@ -120,7 +120,7 @@ public static boolean checkPrecisionAndScale(int decimalPrecision, int decimalSc decimalScale + " != " + vectorScale); } if (decimalPrecision > vectorPrecision) { - throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " + + throw new UnsupportedOperationException("BigDecimal precision cannot be greater than that in the Arrow " + "vector: " + decimalPrecision + " > " + vectorPrecision); } return true; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index c5de380f9c173..068717c7acbc7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -506,7 +506,7 @@ public ValueVector visit(DenseUnionVector deltaVector, Void value) { targetChildVector = targetDenseUnionVector.addVector( (byte) i, deltaChildVector.getField().createVector(targetDenseUnionVector.getAllocator())); - // now we have both child vecors not null, we can append them. + // now we have both child vectors not null, we can append them. VectorAppender childAppender = new VectorAppender(targetChildVector); deltaChildVector.accept(childAppender, null); } else if (targetChildVector != null && deltaChildVector == null) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java index 08e3ccccfa29a..3d1c5a4f27f7c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java @@ -280,7 +280,7 @@ public Void visit(BaseFixedWidthVector vector, Void value) { validateOrThrow(arrowType.getByteWidth() > 0, "The byte width of a FixedSizeBinaryVector %s is not positive.", arrowType.getByteWidth()); validateOrThrow(arrowType.getByteWidth() == vector.getTypeWidth(), - "Type width mismatch for FixedSizeBinaryVector. Vector type width %s, arrow type type width %s.", + "Type width mismatch for FixedSizeBinaryVector. 
Vector type width %s, arrow type width %s.", vector.getTypeWidth(), arrowType.getByteWidth()); } else { throw new IllegalArgumentException("Unknown type for fixed width vector " + vector.getClass()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java index 51368cf6aea35..b703959d2bb1e 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimal256Vector.java @@ -104,7 +104,7 @@ public void testDecimal256DifferentScaleAndPrecision() { BigDecimal decimal = new BigDecimal(BigInteger.valueOf(12345), 2); UnsupportedOperationException ue = assertThrows(UnsupportedOperationException.class, () -> decimalVector.setSafe(0, decimal)); - assertEquals("BigDecimal precision can not be greater than that in the Arrow vector: 5 > 4", ue.getMessage()); + assertEquals("BigDecimal precision cannot be greater than that in the Arrow vector: 5 > 4", ue.getMessage()); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 01becf00794ee..9cb12481612b2 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -610,9 +610,9 @@ public void testChildVectorValueCounts() { assertEquals(8L, longVector.get(0)); assertEquals(12L, longVector.get(1)); - Float4Vector floagVector = (Float4Vector) vector.getVectorByType(floatTypeId); - assertEquals(1, floagVector.getValueCount()); - assertEquals(9.0f, floagVector.get(0), 0); + Float4Vector floatVector = (Float4Vector) vector.getVectorByType(floatTypeId); + assertEquals(1, floatVector.getValueCount()); + assertEquals(9.0f, floatVector.get(0), 0); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index 18bb2c95738a4..7d5701ddb765b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -255,7 +255,7 @@ public void testVarCharAllocateNew() throws Exception { try (final VarCharVector vector = new VarCharVector("", allocator)) { vector.allocateNew(count); - // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + // verify that the validity buffer and value buffer have capacity for at least 'count' elements. Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseVariableWidthVector.OFFSET_WIDTH); } @@ -268,7 +268,7 @@ public void testLargeVarCharAllocateNew() throws Exception { try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { vector.allocateNew(count); - // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + // verify that the validity buffer and value buffer have capacity for at least 'count' elements. 
Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH); } @@ -281,7 +281,7 @@ public void testVarCharAllocateNewUsingHelper() throws Exception { try (final VarCharVector vector = new VarCharVector("", allocator)) { AllocationHelper.allocateNew(vector, count); - // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + // verify that the validity buffer and value buffer have capacity for at least 'count' elements. Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseVariableWidthVector.OFFSET_WIDTH); } @@ -294,7 +294,7 @@ public void testLargeVarCharAllocateNewUsingHelper() throws Exception { try (final LargeVarCharVector vector = new LargeVarCharVector("", allocator)) { AllocationHelper.allocateNew(vector, count); - // verify that the validity buffer and value buffer have capacity for atleast 'count' elements. + // verify that the validity buffer and value buffer have capacity for at least 'count' elements. Assert.assertTrue(vector.getValidityBuffer().capacity() >= DataSizeRoundingUtil.divideBy8Ceil(count)); Assert.assertTrue(vector.getOffsetBuffer().capacity() >= (count + 1) * BaseLargeVariableWidthVector.OFFSET_WIDTH); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java index 11b8d4fadd164..79a4b249a8a89 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java @@ -108,7 +108,7 @@ public void testWriteMessageBufferAligned() throws IOException { // First message continuation, size, and 2 int values assertEquals(MessageSerializer.IPC_CONTINUATION_TOKEN, result.getInt()); - // mesage length is represented in little endian + // message length is represented in little endian result.order(ByteOrder.LITTLE_ENDIAN); assertEquals(8, result.getInt()); result.order(ByteOrder.nativeOrder()); @@ -117,7 +117,7 @@ public void testWriteMessageBufferAligned() throws IOException { // Second message continuation, size, 1 int value and 4 bytes padding assertEquals(MessageSerializer.IPC_CONTINUATION_TOKEN, result.getInt()); - // mesage length is represented in little endian + // message length is represented in little endian result.order(ByteOrder.LITTLE_ENDIAN); assertEquals(8, result.getInt()); result.order(ByteOrder.nativeOrder()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java index 1b3d5eee35f88..084350410a4f5 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java @@ -177,7 +177,7 @@ public void testNullCheck() { vector.allocateNewSafe(); } }); - assertTrue(e.getMessage().contains("underlyingVector can not be null.")); + assertTrue(e.getMessage().contains("underlyingVector cannot be null.")); } /** From a4fae0230693f382d99910273c0c983ea3bc933a Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Thu, 14 Dec 2023 06:38:02 -0800 Subject: [PATCH 046/570] GH-39037: [Java] Remove 
(Contrib/Experimental) mention in Flight SQL (#39040) ### Rationale for this change Considering that Flight SQL has been present for a while and should be fairly stable, remove the `(Contrib/Experimental)` mention, which also shows up on the Maven Central UI pages, from the pom files. ### Are these changes tested? Tested locally, but there is no code change; it is only cosmetic. ### Are there any user-facing changes? None * Closes: #39037 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/flight/flight-sql-jdbc-core/pom.xml | 2 +- java/flight/flight-sql-jdbc-driver/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml index cbeaa88f1e2f7..74a2f8d320f37 100644 --- a/java/flight/flight-sql-jdbc-core/pom.xml +++ b/java/flight/flight-sql-jdbc-core/pom.xml @@ -23,7 +23,7 @@ flight-sql-jdbc-core Arrow Flight SQL JDBC Driver Core - (Contrib/Experimental) Core implementation of JDBC driver based on Arrow Flight SQL. + Core implementation of JDBC driver based on Arrow Flight SQL. jar https://arrow.apache.org diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 84462f54950ba..d4ef1b4ea3b9b 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -23,7 +23,7 @@ flight-sql-jdbc-driver Arrow Flight SQL JDBC Driver - (Contrib/Experimental) A JDBC driver based on Arrow Flight SQL. + A JDBC driver based on Arrow Flight SQL. jar https://arrow.apache.org From 4e58f7ca0016c2b2d8a859a0c5965df3b15523e0 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 14 Dec 2023 15:25:28 -0300 Subject: [PATCH 047/570] GH-39119: [C++] Refactor the Azure FS tests and filesystem class instantiation (#39207) ### Rationale for this change This PR contains some unrelated improvements (like better docs) and some nitpicky fixes. The test refactoring makes it easier to see which environments the tests run on and allows the tests to be instantiated with different options in the future once we extend `AzureOptions`. ### What changes are included in this PR? - Random cleanups - Short namespace aliases - Refactoring of the tests (multiple environments, TYPED_TEST_SUITE, explicit preexisting data setup) - Cleanup of the `AzureOptions` class ### Are these changes tested? Yes. I created Azure Storage accounts to test with and without Hierarchical Namespace support. I also ran the tests in a shell without my environment variables to ensure the test-skipping behavior is correct. ### Are there any user-facing changes? Changes to `AzureOptions`, but since `AzureFileSystem` is not really used yet, these breaking changes won't be a problem.
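As a concrete illustration of the `AzureOptions` cleanup, here is a minimal usage sketch. It is not part of the PR: the helper function, account name, and key are placeholders, and the spellings (`AzureBackend::kAzurite`, `ConfigureAccountKeyCredential`, `MakeBlobServiceClient`, `MakeDataLakeServiceClient`) are taken from the diff that follows, assuming they are exposed through `azurefs.h`.

```cpp
// Sketch only: configure AzureOptions for a local Azurite emulator and obtain
// the underlying Azure SDK clients. Account name/key are placeholder values.
#include "arrow/filesystem/azurefs.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status ConnectToAzuriteSketch() {
  arrow::fs::AzureOptions options;
  // kAzurite makes ConfigureAccountKeyCredential point the client URLs at the
  // local emulator endpoints (http://127.0.0.1:10000/...), per the diff below.
  options.backend = arrow::fs::AzureBackend::kAzurite;
  ARROW_RETURN_NOT_OK(options.ConfigureAccountKeyCredential(
      "devstoreaccount1", "placeholder-account-key"));
  // After configuration, the options object hands out the SDK clients that
  // AzureFileSystem::Impl::Make asks for when the filesystem is created.
  ARROW_ASSIGN_OR_RAISE(auto blob_service_client, options.MakeBlobServiceClient());
  ARROW_ASSIGN_OR_RAISE(auto datalake_service_client,
                        options.MakeDataLakeServiceClient());
  (void)blob_service_client;      // unused in this sketch
  (void)datalake_service_client;  // unused in this sketch
  return arrow::Status::OK();
}
```

Keeping SDK client construction behind `MakeBlobServiceClient()` and `MakeDataLakeServiceClient()` is what lets `AzureFileSystem::Impl::Make` stay agnostic of which credential kind was configured.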
* Closes: #39119 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 559 ++++---- cpp/src/arrow/filesystem/azurefs.h | 150 ++- cpp/src/arrow/filesystem/azurefs_internal.cc | 6 + cpp/src/arrow/filesystem/azurefs_internal.h | 3 - cpp/src/arrow/filesystem/azurefs_test.cc | 1269 ++++++++++-------- 5 files changed, 1094 insertions(+), 893 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 824a8fb531483..217885364089b 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -33,42 +33,85 @@ #include "arrow/util/logging.h" #include "arrow/util/string.h" -namespace arrow { -namespace fs { +namespace arrow::fs { + +namespace Blobs = Azure::Storage::Blobs; +namespace Core = Azure::Core; +namespace DataLake = Azure::Storage::Files::DataLake; +namespace Http = Azure::Core::Http; +namespace Storage = Azure::Storage; // ----------------------------------------------------------------------- // AzureOptions Implementation AzureOptions::AzureOptions() = default; +AzureOptions::~AzureOptions() = default; + bool AzureOptions::Equals(const AzureOptions& other) const { - return (account_dfs_url == other.account_dfs_url && - account_blob_url == other.account_blob_url && - credentials_kind == other.credentials_kind && - default_metadata == other.default_metadata); + // TODO(GH-38598): update here when more auth methods are added. + const bool equals = backend == other.backend && + default_metadata == other.default_metadata && + account_blob_url_ == other.account_blob_url_ && + account_dfs_url_ == other.account_dfs_url_ && + credential_kind_ == other.credential_kind_; + if (!equals) { + return false; + } + switch (credential_kind_) { + case CredentialKind::kAnonymous: + return true; + case CredentialKind::kStorageSharedKeyCredential: + return storage_shared_key_credential_->AccountName == + other.storage_shared_key_credential_->AccountName; + } + DCHECK(false); + return false; } -Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_name, - const std::string& account_key) { - if (this->backend == AzureBackend::Azurite) { - account_blob_url = "http://127.0.0.1:10000/" + account_name + "/"; - account_dfs_url = "http://127.0.0.1:10000/" + account_name + "/"; +Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, + const std::string& account_key) { + if (this->backend == AzureBackend::kAzurite) { + account_blob_url_ = "http://127.0.0.1:10000/" + account_name + "/"; + account_dfs_url_ = "http://127.0.0.1:10000/" + account_name + "/"; } else { - account_dfs_url = "https://" + account_name + ".dfs.core.windows.net/"; - account_blob_url = "https://" + account_name + ".blob.core.windows.net/"; + account_dfs_url_ = "https://" + account_name + ".dfs.core.windows.net/"; + account_blob_url_ = "https://" + account_name + ".blob.core.windows.net/"; } - storage_credentials_provider = - std::make_shared(account_name, - account_key); - credentials_kind = AzureCredentialsKind::StorageCredentials; + credential_kind_ = CredentialKind::kStorageSharedKeyCredential; + storage_shared_key_credential_ = + std::make_shared(account_name, account_key); return Status::OK(); } +Result> AzureOptions::MakeBlobServiceClient() + const { + switch (credential_kind_) { + case CredentialKind::kAnonymous: + break; + case CredentialKind::kStorageSharedKeyCredential: + return std::make_unique(account_blob_url_, + 
storage_shared_key_credential_); + } + return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); +} + +Result> +AzureOptions::MakeDataLakeServiceClient() const { + switch (credential_kind_) { + case CredentialKind::kAnonymous: + break; + case CredentialKind::kStorageSharedKeyCredential: + return std::make_unique( + account_dfs_url_, storage_shared_key_credential_); + } + return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); +} + namespace { -// An AzureFileSystem represents a single Azure storage -// account. AzureLocation describes a container and path within -// that storage account. +// An AzureFileSystem represents an Azure storage account. An AzureLocation describes a +// container in that storage account and a path within that container. struct AzureLocation { std::string all; std::string container; @@ -82,8 +125,8 @@ struct AzureLocation { // path_parts = [testdir, testfile.txt] if (internal::IsLikelyUri(string)) { return Status::Invalid( - "Expected an Azure object location of the form 'container/path...', got a URI: " - "'", + "Expected an Azure object location of the form 'container/path...'," + " got a URI: '", string, "'"); } auto first_sep = string.find_first_of(internal::kSep); @@ -130,14 +173,15 @@ struct AzureLocation { private: Status Validate() { auto status = internal::ValidateAbstractPathParts(path_parts); - if (!status.ok()) { - return Status::Invalid(status.message(), " in location ", all); - } else { - return status; - } + return status.ok() ? status : Status::Invalid(status.message(), " in location ", all); } }; +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { + return Status::IOError(prefix, " Azure Error: ", exception.what()); +} + Status PathNotFound(const AzureLocation& location) { return ::arrow::fs::internal::PathNotFound(location.all); } @@ -153,23 +197,41 @@ Status ValidateFileLocation(const AzureLocation& location) { if (location.path.empty()) { return NotAFile(location); } - ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(location.path)); - return Status::OK(); + return internal::AssertNoTrailingSlash(location.path); +} + +std::string_view BodyTextView(const Http::RawResponse& raw_response) { + const auto& body = raw_response.GetBody(); +#ifndef NDEBUG + auto& headers = raw_response.GetHeaders(); + auto content_type = headers.find("Content-Type"); + if (content_type != headers.end()) { + DCHECK_EQ(std::string_view{content_type->second}.substr(5), "text/"); + } +#endif + return std::string_view{reinterpret_cast(body.data()), body.size()}; } Status StatusFromErrorResponse(const std::string& url, - Azure::Core::Http::RawResponse* raw_response, + const Http::RawResponse& raw_response, const std::string& context) { - const auto& body = raw_response->GetBody(); // There isn't an Azure specification that response body on error // doesn't contain any binary data but we assume it. We hope that // error response body has useful information for the error. 
- std::string_view body_text(reinterpret_cast(body.data()), body.size()); - return Status::IOError(context, ": ", url, ": ", raw_response->GetReasonPhrase(), " (", - static_cast(raw_response->GetStatusCode()), + auto body_text = BodyTextView(raw_response); + return Status::IOError(context, ": ", url, ": ", raw_response.GetReasonPhrase(), " (", + static_cast(raw_response.GetStatusCode()), "): ", body_text); } +bool IsContainerNotFound(const Storage::StorageException& exception) { + if (exception.ErrorCode == "ContainerNotFound") { + DCHECK_EQ(exception.StatusCode, Http::HttpStatusCode::NotFound); + return true; + } + return false; +} + template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -185,7 +247,7 @@ std::string FormatValue(typename TypeTraits::CType value) { } std::shared_ptr PropertiesToMetadata( - const Azure::Storage::Blobs::Models::BlobProperties& properties) { + const Blobs::Models::BlobProperties& properties) { auto metadata = std::make_shared(); // Not supported yet: // * properties.ObjectReplicationSourceProperties @@ -316,7 +378,7 @@ std::shared_ptr PropertiesToMetadata( class ObjectInputFile final : public io::RandomAccessFile { public: - ObjectInputFile(std::shared_ptr blob_client, + ObjectInputFile(std::shared_ptr blob_client, const io::IOContext& io_context, AzureLocation location, int64_t size = kNoSize) : blob_client_(std::move(blob_client)), @@ -334,11 +396,11 @@ class ObjectInputFile final : public io::RandomAccessFile { content_length_ = properties.Value.BlobSize; metadata_ = PropertiesToMetadata(properties.Value); return Status::OK(); - } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { return PathNotFound(location_); } - return internal::ExceptionToStatus( + return ExceptionToStatus( "GetProperties failed for '" + blob_client_->GetUrl() + "' with an unexpected Azure error. Cannot initialise an ObjectInputFile " "without knowing the file size.", @@ -411,20 +473,20 @@ class ObjectInputFile final : public io::RandomAccessFile { } // Read the desired range of bytes - Azure::Core::Http::HttpRange range{position, nbytes}; - Azure::Storage::Blobs::DownloadBlobToOptions download_options; + Http::HttpRange range{position, nbytes}; + Storage::Blobs::DownloadBlobToOptions download_options; download_options.Range = range; try { return blob_client_ ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus("DownloadTo from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + - " for " + std::to_string(nbytes) + - " bytes failed with an Azure error. ReadAt " - "failed to read the required byte range.", - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("DownloadTo from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + " for " + + std::to_string(nbytes) + + " bytes failed with an Azure error. 
ReadAt " + "failed to read the required byte range.", + exception); } } @@ -459,7 +521,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } private: - std::shared_ptr blob_client_; + std::shared_ptr blob_client_; const io::IOContext io_context_; AzureLocation location_; @@ -469,12 +531,11 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; }; -Status CreateEmptyBlockBlob( - std::shared_ptr block_blob_client) { +Status CreateEmptyBlockBlob(std::shared_ptr block_blob_client) { try { block_blob_client->UploadFrom(nullptr, 0); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( "UploadFrom failed for '" + block_blob_client->GetUrl() + "' with an unexpected Azure error. There is no existing blob at this " "location or the existing blob must be replaced so ObjectAppendStream must " @@ -484,12 +545,12 @@ Status CreateEmptyBlockBlob( return Status::OK(); } -Result GetBlockList( - std::shared_ptr block_blob_client) { +Result GetBlockList( + std::shared_ptr block_blob_client) { try { return block_blob_client->GetBlockList().Value; - } catch (Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (Storage::StorageException& exception) { + return ExceptionToStatus( "GetBlockList failed for '" + block_blob_client->GetUrl() + "' with an unexpected Azure error. Cannot write to a file without first " "fetching the existing block list.", @@ -497,19 +558,19 @@ Result GetBlockList( } } -Azure::Storage::Metadata ArrowMetadataToAzureMetadata( +Storage::Metadata ArrowMetadataToAzureMetadata( const std::shared_ptr& arrow_metadata) { - Azure::Storage::Metadata azure_metadata; + Storage::Metadata azure_metadata; for (auto key_value : arrow_metadata->sorted_pairs()) { azure_metadata[key_value.first] = key_value.second; } return azure_metadata; } -Status CommitBlockList( - std::shared_ptr block_blob_client, - const std::vector& block_ids, const Azure::Storage::Metadata& metadata) { - Azure::Storage::Blobs::CommitBlockListOptions options; +Status CommitBlockList(std::shared_ptr block_blob_client, + const std::vector& block_ids, + const Storage::Metadata& metadata) { + Blobs::CommitBlockListOptions options; options.Metadata = metadata; try { // CommitBlockList puts all block_ids in the latest element. That means in the case of @@ -517,8 +578,8 @@ Status CommitBlockList( // previously committed blocks. // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body block_blob_client->CommitBlockList(block_ids, options); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( "CommitBlockList failed for '" + block_blob_client->GetUrl() + "' with an unexpected Azure error. 
Committing is required to flush an " "output/append stream.", @@ -529,11 +590,10 @@ Status CommitBlockList( class ObjectAppendStream final : public io::OutputStream { public: - ObjectAppendStream( - std::shared_ptr block_blob_client, - const io::IOContext& io_context, const AzureLocation& location, - const std::shared_ptr& metadata, - const AzureOptions& options, int64_t size = kNoSize) + ObjectAppendStream(std::shared_ptr block_blob_client, + const io::IOContext& io_context, const AzureLocation& location, + const std::shared_ptr& metadata, + const AzureOptions& options, int64_t size = kNoSize) : block_blob_client_(std::move(block_blob_client)), io_context_(io_context), location_(location), @@ -560,11 +620,11 @@ class ObjectAppendStream final : public io::OutputStream { auto properties = block_blob_client_->GetProperties(); content_length_ = properties.Value.BlobSize; pos_ = content_length_; - } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client_)); } else { - return internal::ExceptionToStatus( + return ExceptionToStatus( "GetProperties failed for '" + block_blob_client_->GetUrl() + "' with an unexpected Azure error. Cannot initialise an " "ObjectAppendStream without knowing whether a file already exists at " @@ -634,7 +694,7 @@ class ObjectAppendStream final : public io::OutputStream { std::shared_ptr owned_buffer = nullptr) { RETURN_NOT_OK(CheckClosed("append")); auto append_data = reinterpret_cast(data); - Azure::Core::IO::MemoryBodyStream block_content(append_data, nbytes); + Core::IO::MemoryBodyStream block_content(append_data, nbytes); if (block_content.Length() == 0) { return Status::OK(); } @@ -657,13 +717,13 @@ class ObjectAppendStream final : public io::OutputStream { // if the blob was previously created with one block, with id `00001-arrow` then the // next block we append will conflict with that, and cause corruption. new_block_id += "-arrow"; - new_block_id = Azure::Core::Convert::Base64Encode( + new_block_id = Core::Convert::Base64Encode( std::vector(new_block_id.begin(), new_block_id.end())); try { block_blob_client_->StageBlock(new_block_id, block_content); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( "StageBlock failed for '" + block_blob_client_->GetUrl() + "' new_block_id: '" + new_block_id + "' with an unexpected Azure error. 
Staging new blocks is fundamental to " @@ -676,7 +736,7 @@ class ObjectAppendStream final : public io::OutputStream { return Status::OK(); } - std::shared_ptr block_blob_client_; + std::shared_ptr block_blob_client_; const io::IOContext io_context_; const AzureLocation location_; @@ -684,7 +744,7 @@ class ObjectAppendStream final : public io::OutputStream { int64_t pos_ = 0; int64_t content_length_ = kNoSize; std::vector block_ids_; - Azure::Storage::Metadata metadata_; + Storage::Metadata metadata_; }; } // namespace @@ -693,27 +753,31 @@ class ObjectAppendStream final : public io::OutputStream { // AzureFilesystem Implementation class AzureFileSystem::Impl { - public: + private: io::IOContext io_context_; - std::unique_ptr - datalake_service_client_; - std::unique_ptr blob_service_client_; AzureOptions options_; - internal::HierarchicalNamespaceDetector hierarchical_namespace_; - explicit Impl(AzureOptions options, io::IOContext io_context) - : io_context_(io_context), options_(std::move(options)) {} + std::unique_ptr datalake_service_client_; + std::unique_ptr blob_service_client_; + internal::HierarchicalNamespaceDetector hns_detector_; - Status Init() { - blob_service_client_ = std::make_unique( - options_.account_blob_url, options_.storage_credentials_provider); - datalake_service_client_ = - std::make_unique( - options_.account_dfs_url, options_.storage_credentials_provider); - RETURN_NOT_OK(hierarchical_namespace_.Init(datalake_service_client_.get())); - return Status::OK(); - } + Impl(AzureOptions options, io::IOContext io_context) + : io_context_(std::move(io_context)), options_(std::move(options)) {} + public: + static Result> Make(AzureOptions options, + io::IOContext io_context) { + auto self = std::unique_ptr( + new AzureFileSystem::Impl(std::move(options), std::move(io_context))); + ARROW_ASSIGN_OR_RAISE(self->blob_service_client_, + self->options_.MakeBlobServiceClient()); + ARROW_ASSIGN_OR_RAISE(self->datalake_service_client_, + self->options_.MakeDataLakeServiceClient()); + RETURN_NOT_OK(self->hns_detector_.Init(self->datalake_service_client_.get())); + return self; + } + + io::IOContext& io_context() { return io_context_; } const AzureOptions& options() const { return options_; } public: @@ -722,12 +786,10 @@ class AzureFileSystem::Impl { info.set_path(location.all); if (location.container.empty()) { - // The location is invalid if the container is empty but not - // path. + // The location is invalid if the container is empty but the path is not. DCHECK(location.path.empty()); - // The location must refer to the root of the Azure storage - // account. This is a directory, and there isn't any extra - // metadata to fetch. + // This location must be derived from the root path. FileInfo should describe it + // as a directory and there isn't any extra metadata to fetch. 
info.set_type(FileType::Directory); return info; } @@ -739,20 +801,22 @@ class AzureFileSystem::Impl { auto properties = container_client.GetProperties(); info.set_type(FileType::Directory); info.set_mtime( - std::chrono::system_clock::time_point(properties.Value.LastModified)); + std::chrono::system_clock::time_point{properties.Value.LastModified}); return info; - } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { info.set_type(FileType::NotFound); return info; } - return internal::ExceptionToStatus( + return ExceptionToStatus( "GetProperties for '" + container_client.GetUrl() + "' failed with an unexpected Azure error. GetFileInfo is unable to " "determine whether the container exists.", exception); } } + + // There is a path to search within the container. auto file_client = datalake_service_client_->GetFileSystemClient(location.container) .GetFileClient(location.path); try { @@ -763,6 +827,8 @@ class AzureFileSystem::Impl { // For a path with a trailing slash a hierarchical namespace may return a blob // with that trailing slash removed. For consistency with flat namespace and // other filesystems we chose to return NotFound. + // + // NOTE(felipecrv): could this be an empty directory marker? info.set_type(FileType::NotFound); return info; } else { @@ -770,12 +836,12 @@ class AzureFileSystem::Impl { info.set_size(properties.Value.FileSize); } info.set_mtime( - std::chrono::system_clock::time_point(properties.Value.LastModified)); + std::chrono::system_clock::time_point{properties.Value.LastModified}); return info; - } catch (const Azure::Storage::StorageException& exception) { - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + } catch (const Storage::StorageException& exception) { + if (exception.StatusCode == Http::HttpStatusCode::NotFound) { ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(location.container)); + hns_detector_.Enabled(location.container)); if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. @@ -784,12 +850,10 @@ class AzureFileSystem::Impl { } // On flat namespace accounts there are no real directories. Directories are only // implied by using `/` in the blob name. - Azure::Storage::Blobs::ListBlobsOptions list_blob_options; - + Blobs::ListBlobsOptions list_blob_options; // If listing the prefix `path.path_to_file` with trailing slash returns at least // one result then `path` refers to an implied directory. - auto prefix = internal::EnsureTrailingSlash(location.path); - list_blob_options.Prefix = prefix; + list_blob_options.Prefix = internal::EnsureTrailingSlash(location.path); // We only need to know if there is at least one result, so minimise page size // for efficiency. list_blob_options.PageSizeHint = 1; @@ -798,21 +862,19 @@ class AzureFileSystem::Impl { auto paged_list_result = blob_service_client_->GetBlobContainerClient(location.container) .ListBlobs(list_blob_options); - if (paged_list_result.Blobs.size() > 0) { - info.set_type(FileType::Directory); - } else { - info.set_type(FileType::NotFound); - } + auto file_type = paged_list_result.Blobs.size() > 0 ? 
FileType::Directory + : FileType::NotFound; + info.set_type(file_type); return info; - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "ListBlobs for '" + prefix + + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + "ListBlobs for '" + *list_blob_options.Prefix + "' failed with an unexpected Azure error. GetFileInfo is unable to " "determine whether the path should be considered an implied directory.", exception); } } - return internal::ExceptionToStatus( + return ExceptionToStatus( "GetProperties for '" + file_client.GetUrl() + "' failed with an unexpected " "Azure error. GetFileInfo is unable to determine whether the path exists.", @@ -822,9 +884,8 @@ class AzureFileSystem::Impl { private: template - Status VisitContainers(const Azure::Core::Context& context, - OnContainer&& on_container) const { - Azure::Storage::Blobs::ListBlobContainersOptions options; + Status VisitContainers(const Core::Context& context, OnContainer&& on_container) const { + Blobs::ListBlobContainersOptions options; try { auto container_list_response = blob_service_client_->ListBlobContainers(options, context); @@ -834,14 +895,14 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(on_container(container)); } } - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus("Failed to list account containers.", exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to list account containers.", exception); } return Status::OK(); } - static FileInfo FileInfoFromBlob(const std::string& container, - const Azure::Storage::Blobs::Models::BlobItem& blob) { + static FileInfo FileInfoFromBlob(std::string_view container, + const Blobs::Models::BlobItem& blob) { auto path = internal::ConcatAbstractPath(container, blob.Name); if (internal::HasTrailingSlash(blob.Name)) { return DirectoryFileInfoFromPath(path); @@ -852,7 +913,7 @@ class AzureFileSystem::Impl { return info; } - static FileInfo DirectoryFileInfoFromPath(const std::string& path) { + static FileInfo DirectoryFileInfoFromPath(std::string_view path) { return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, FileType::Directory}; } @@ -870,13 +931,13 @@ class AzureFileSystem::Impl { /// \pre container_client is the client for the container named like the first /// segment of select.base_dir. 
Status GetFileInfoWithSelectorFromContainer( - const Azure::Storage::Blobs::BlobContainerClient& container_client, - const Azure::Core::Context& context, Azure::Nullable page_size_hint, - const FileSelector& select, FileInfoVector* acc_results) { + const Blobs::BlobContainerClient& container_client, const Core::Context& context, + Azure::Nullable page_size_hint, const FileSelector& select, + FileInfoVector* acc_results) { ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); bool found = false; - Azure::Storage::Blobs::ListBlobsOptions options; + Blobs::ListBlobsOptions options; if (internal::IsEmptyPath(base_location.path)) { // If the base_dir is the root of the container, then we want to list all blobs in // the container and the Prefix should be empty and not even include the trailing @@ -887,7 +948,7 @@ class AzureFileSystem::Impl { options.Prefix = internal::EnsureTrailingSlash(base_location.path); } options.PageSizeHint = page_size_hint; - options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; + options.Include = Blobs::Models::ListBlobsIncludeFlags::Metadata; auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { if (select.recursive && select.max_recursion > 0) { @@ -903,15 +964,14 @@ class AzureFileSystem::Impl { return Status::OK(); }; - auto process_blob = - [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { - // blob.Name has trailing slash only when Prefix is an empty - // directory marker blob for the directory we're listing - // from, and we should skip it. - if (!internal::HasTrailingSlash(blob.Name)) { - acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); - } - }; + auto process_blob = [&](const Blobs::Models::BlobItem& blob) noexcept { + // blob.Name has trailing slash only when Prefix is an empty + // directory marker blob for the directory we're listing + // from, and we should skip it. + if (!internal::HasTrailingSlash(blob.Name)) { + acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); + } + }; auto process_prefix = [&](const std::string& prefix) noexcept -> Status { const auto path = internal::ConcatAbstractPath(base_location.container, prefix); acc_results->push_back(DirectoryFileInfoFromPath(path)); @@ -964,14 +1024,13 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); } } - } catch (const Azure::Storage::StorageException& exception) { - if (exception.ErrorCode == "ContainerNotFound") { + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { found = false; } else { - return internal::ExceptionToStatus( - "Failed to list blobs in a directory: " + select.base_dir + ": " + - container_client.GetUrl(), - exception); + return ExceptionToStatus("Failed to list blobs in a directory: " + + select.base_dir + ": " + container_client.GetUrl(), + exception); } } @@ -981,7 +1040,7 @@ class AzureFileSystem::Impl { } public: - Status GetFileInfoWithSelector(const Azure::Core::Context& context, + Status GetFileInfoWithSelector(const Core::Context& context, Azure::Nullable page_size_hint, const FileSelector& select, FileInfoVector* acc_results) { @@ -991,29 +1050,28 @@ class AzureFileSystem::Impl { // Without a container, the base_location is equivalent to the filesystem // root -- `/`. FileSelector::allow_not_found doesn't matter in this case // because the root always exists. 
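      // Usage sketch (a minimal illustration, assuming `fs` is an AzureFileSystem
      // obtained from AzureFileSystem::Make(options)): listing the account root
      // enumerates every container as a directory.
      //
      //   FileSelector select;
      //   select.base_dir = "";  // the account root, i.e. "/"
      //   ARROW_ASSIGN_OR_RAISE(auto infos, fs->GetFileInfo(select));
      //   // Each FileInfo in `infos` describes a container with FileType::Directory.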
- auto on_container = - [&](const Azure::Storage::Blobs::Models::BlobContainerItem& container) { - // Deleted containers are not listed by ListContainers. - DCHECK(!container.IsDeleted); - - // Every container is considered a directory. - FileInfo info{container.Name, FileType::Directory}; - info.set_mtime( - std::chrono::system_clock::time_point{container.Details.LastModified}); - acc_results->push_back(std::move(info)); - - // Recurse into containers (subdirectories) if requested. - if (select.recursive && select.max_recursion > 0) { - FileSelector sub_select; - sub_select.base_dir = container.Name; - sub_select.allow_not_found = true; - sub_select.recursive = true; - sub_select.max_recursion = select.max_recursion - 1; - ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context, page_size_hint, - sub_select, acc_results)); - } - return Status::OK(); - }; + auto on_container = [&](const Blobs::Models::BlobContainerItem& container) { + // Deleted containers are not listed by ListContainers. + DCHECK(!container.IsDeleted); + + // Every container is considered a directory. + FileInfo info{container.Name, FileType::Directory}; + info.set_mtime( + std::chrono::system_clock::time_point{container.Details.LastModified}); + acc_results->push_back(std::move(info)); + + // Recurse into containers (subdirectories) if requested. + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = container.Name; + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + ARROW_RETURN_NOT_OK( + GetFileInfoWithSelector(context, page_size_hint, sub_select, acc_results)); + } + return Status::OK(); + }; return VisitContainers(context, std::move(on_container)); } @@ -1026,7 +1084,7 @@ class AzureFileSystem::Impl { Result> OpenInputFile(const AzureLocation& location, AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); - auto blob_client = std::make_shared( + auto blob_client = std::make_shared( blob_service_client_->GetBlobContainerClient(location.container) .GetBlobClient(location.path)); @@ -1046,7 +1104,7 @@ class AzureFileSystem::Impl { } ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(info.path())); RETURN_NOT_OK(ValidateFileLocation(location)); - auto blob_client = std::make_shared( + auto blob_client = std::make_shared( blob_service_client_->GetBlobContainerClient(location.container) .GetBlobClient(location.path)); @@ -1070,19 +1128,18 @@ class AzureFileSystem::Impl { return Status::OK(); } else { return StatusFromErrorResponse( - container_client.GetUrl(), response.RawResponse.get(), + container_client.GetUrl(), *response.RawResponse, "Failed to create a container: " + location.container); } - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to create a container: " + location.container + ": " + - container_client.GetUrl(), - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to create a container: " + location.container + + ": " + container_client.GetUrl(), + exception); } } ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(location.container)); + hns_detector_.Enabled(location.container)); if (!hierarchical_namespace_enabled) { // Without hierarchical namespace enabled Azure blob storage has no directories. // Therefore we can't, and don't need to create one. 
Simply creating a blob with `/` @@ -1098,15 +1155,13 @@ class AzureFileSystem::Impl { if (response.Value.Created) { return Status::OK(); } else { - return StatusFromErrorResponse(directory_client.GetUrl(), - response.RawResponse.get(), + return StatusFromErrorResponse(directory_client.GetUrl(), *response.RawResponse, "Failed to create a directory: " + location.path); } - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to create a directory: " + location.path + ": " + - directory_client.GetUrl(), - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to create a directory: " + location.path + ": " + + directory_client.GetUrl(), + exception); } } @@ -1119,15 +1174,14 @@ class AzureFileSystem::Impl { blob_service_client_->GetBlobContainerClient(location.container); try { container_client.CreateIfNotExists(); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to create a container: " + location.container + " (" + - container_client.GetUrl() + ")", - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to create a container: " + location.container + + " (" + container_client.GetUrl() + ")", + exception); } ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(location.container)); + hns_detector_.Enabled(location.container)); if (!hierarchical_namespace_enabled) { // Without hierarchical namespace enabled Azure blob storage has no directories. // Therefore we can't, and don't need to create one. Simply creating a blob with `/` @@ -1141,11 +1195,10 @@ class AzureFileSystem::Impl { .GetDirectoryClient(location.path); try { directory_client.CreateIfNotExists(); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to create a directory: " + location.path + " (" + - directory_client.GetUrl() + ")", - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to create a directory: " + location.path + " (" + + directory_client.GetUrl() + ")", + exception); } } @@ -1158,7 +1211,7 @@ class AzureFileSystem::Impl { AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); - auto block_blob_client = std::make_shared( + auto block_blob_client = std::make_shared( blob_service_client_->GetBlobContainerClient(location.container) .GetBlockBlobClient(location.path)); @@ -1180,7 +1233,7 @@ class AzureFileSystem::Impl { bool missing_dir_ok) { auto container_client = blob_service_client_->GetBlobContainerClient(location.container); - Azure::Storage::Blobs::ListBlobsOptions options; + Blobs::ListBlobsOptions options; if (!location.path.empty()) { options.Prefix = internal::EnsureTrailingSlash(location.path); } @@ -1200,19 +1253,17 @@ class AzureFileSystem::Impl { continue; } auto batch = container_client.CreateBatch(); - std::vector> + std::vector> deferred_responses; for (const auto& blob_item : list_response.Blobs) { deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); } try { container_client.SubmitBatch(batch); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to delete blobs in a directory: " + location.path + ": " + - container_client.GetUrl(), - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to delete blobs 
in a directory: " + + location.path + ": " + container_client.GetUrl(), + exception); } std::vector failed_blob_names; for (size_t i = 0; i < deferred_responses.size(); ++i) { @@ -1221,7 +1272,7 @@ class AzureFileSystem::Impl { try { auto delete_result = deferred_response.GetResponse(); success = delete_result.Value.Deleted; - } catch (const Azure::Storage::StorageException& exception) { + } catch (const Storage::StorageException& exception) { success = false; } if (!success) { @@ -1240,11 +1291,10 @@ class AzureFileSystem::Impl { } } } - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to list blobs in a directory: " + location.path + ": " + - container_client.GetUrl(), - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to list blobs in a directory: " + location.path + + ": " + container_client.GetUrl(), + exception); } return Status::OK(); } @@ -1264,19 +1314,18 @@ class AzureFileSystem::Impl { return Status::OK(); } else { return StatusFromErrorResponse( - container_client.GetUrl(), response.RawResponse.get(), + container_client.GetUrl(), *response.RawResponse, "Failed to delete a container: " + location.container); } - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to delete a container: " + location.container + ": " + - container_client.GetUrl(), - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to delete a container: " + location.container + + ": " + container_client.GetUrl(), + exception); } } ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(location.container)); + hns_detector_.Enabled(location.container)); if (hierarchical_namespace_enabled) { auto directory_client = datalake_service_client_->GetFileSystemClient(location.container) @@ -1287,14 +1336,13 @@ class AzureFileSystem::Impl { return Status::OK(); } else { return StatusFromErrorResponse( - directory_client.GetUrl(), response.RawResponse.get(), + directory_client.GetUrl(), *response.RawResponse, "Failed to delete a directory: " + location.path); } - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( - "Failed to delete a directory: " + location.path + ": " + - directory_client.GetUrl(), - exception); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus("Failed to delete a directory: " + location.path + ": " + + directory_client.GetUrl(), + exception); } } else { return DeleteDirContentsWithoutHierarchicalNamespace(location, @@ -1308,7 +1356,7 @@ class AzureFileSystem::Impl { } ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(location.container)); + hns_detector_.Enabled(location.container)); if (hierarchical_namespace_enabled) { auto file_system_client = datalake_service_client_->GetFileSystemClient(location.container); @@ -1322,8 +1370,8 @@ class AzureFileSystem::Impl { file_system_client.GetDirectoryClient(path.Name); try { sub_directory_client.DeleteRecursive(); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( "Failed to delete a sub directory: " + location.container + internal::kSep + path.Name + ": " + sub_directory_client.GetUrl(), exception); @@ -1332,8 +1380,8 @@ class 
AzureFileSystem::Impl { auto sub_file_client = file_system_client.GetFileClient(path.Name); try { sub_file_client.Delete(); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( "Failed to delete a sub file: " + location.container + internal::kSep + path.Name + ": " + sub_file_client.GetUrl(), exception); @@ -1341,15 +1389,13 @@ class AzureFileSystem::Impl { } } } - } catch (const Azure::Storage::StorageException& exception) { - if (missing_dir_ok && - exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + } catch (const Storage::StorageException& exception) { + if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { return Status::OK(); } else { - return internal::ExceptionToStatus( - "Failed to delete directory contents: " + location.path + ": " + - directory_client.GetUrl(), - exception); + return ExceptionToStatus("Failed to delete directory contents: " + + location.path + ": " + directory_client.GetUrl(), + exception); } } return Status::OK(); @@ -1371,8 +1417,8 @@ class AzureFileSystem::Impl { .GetUrl(); try { dest_blob_client.CopyFromUri(src_url); - } catch (const Azure::Storage::StorageException& exception) { - return internal::ExceptionToStatus( + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( "Failed to copy a blob. (" + src_url + " -> " + dest_blob_client.GetUrl() + ")", exception); } @@ -1380,6 +1426,17 @@ class AzureFileSystem::Impl { } }; +AzureFileSystem::AzureFileSystem(std::unique_ptr&& impl) + : FileSystem(impl->io_context()), impl_(std::move(impl)) { + default_async_is_sync_ = false; +} + +Result> AzureFileSystem::Make( + const AzureOptions& options, const io::IOContext& io_context) { + ARROW_ASSIGN_OR_RAISE(auto impl, AzureFileSystem::Impl::Make(options, io_context)); + return std::shared_ptr(new AzureFileSystem(std::move(impl))); +} + const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } bool AzureFileSystem::Equals(const FileSystem& other) const { @@ -1399,7 +1456,7 @@ Result AzureFileSystem::GetFileInfo(const std::string& path) { } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { - Azure::Core::Context context; + Core::Context context; Azure::Nullable page_size_hint; // unspecified FileInfoVector results; RETURN_NOT_OK( @@ -1478,18 +1535,4 @@ Result> AzureFileSystem::OpenAppendStream( return impl_->OpenAppendStream(location, metadata, false, this); } -Result> AzureFileSystem::Make( - const AzureOptions& options, const io::IOContext& io_context) { - std::shared_ptr ptr(new AzureFileSystem(options, io_context)); - RETURN_NOT_OK(ptr->impl_->Init()); - return ptr; -} - -AzureFileSystem::AzureFileSystem(const AzureOptions& options, - const io::IOContext& io_context) - : FileSystem(io_context), impl_(std::make_unique(options, io_context)) { - default_async_is_sync_ = false; -} - -} // namespace fs -} // namespace arrow +} // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index b2865b059ef6e..1266aa2d02b86 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -25,90 +25,118 @@ #include "arrow/util/macros.h" #include "arrow/util/uri.h" -namespace Azure { -namespace Core { -namespace Credentials { - +namespace Azure::Core::Credentials { class TokenCredential; +} -} // namespace Credentials -} // namespace Core -namespace Storage { 
- +namespace Azure::Storage { class StorageSharedKeyCredential; +} -} // namespace Storage -} // namespace Azure - -namespace arrow { -namespace fs { - -enum class AzureCredentialsKind : int8_t { - /// Anonymous access (no credentials used), public - Anonymous, - /// Use explicitly-provided access key pair - StorageCredentials, - /// Use ServicePrincipleCredentials - ServicePrincipleCredentials, - /// Use Sas Token to authenticate - Sas, - /// Use Connection String - ConnectionString -}; +namespace Azure::Storage::Blobs { +class BlobServiceClient; +} + +namespace Azure::Storage::Files::DataLake { +class DataLakeServiceClient; +} -enum class AzureBackend : bool { - /// Official Azure Remote Backend - Azure, - /// Local Simulated Storage - Azurite +namespace arrow::fs { + +enum class AzureBackend { + /// \brief Official Azure Remote Backend + kAzure, + /// \brief Local Simulated Storage + kAzurite }; /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { - std::string account_dfs_url; - std::string account_blob_url; - AzureBackend backend = AzureBackend::Azure; - AzureCredentialsKind credentials_kind = AzureCredentialsKind::Anonymous; + /// \brief The backend to connect to: Azure or Azurite (for testing). + AzureBackend backend = AzureBackend::kAzure; - std::string sas_token; - std::string connection_string; - std::shared_ptr - storage_credentials_provider; - std::shared_ptr - service_principle_credentials_provider; + // TODO(GH-38598): Add support for more auth methods. + // std::string connection_string; + // std::string sas_token; /// \brief Default metadata for OpenOutputStream. /// /// This will be ignored if non-empty metadata is passed to OpenOutputStream. std::shared_ptr default_metadata; + private: + std::string account_blob_url_; + std::string account_dfs_url_; + + enum class CredentialKind { + kAnonymous, + kStorageSharedKeyCredential, + } credential_kind_ = CredentialKind::kAnonymous; + + std::shared_ptr + storage_shared_key_credential_; + + public: AzureOptions(); + ~AzureOptions(); - Status ConfigureAccountKeyCredentials(const std::string& account_name, - const std::string& account_key); + Status ConfigureAccountKeyCredential(const std::string& account_name, + const std::string& account_key); bool Equals(const AzureOptions& other) const; + + const std::string& AccountBlobUrl() const { return account_blob_url_; } + const std::string& AccountDfsUrl() const { return account_dfs_url_; } + + Result> + MakeBlobServiceClient() const; + + Result> + MakeDataLakeServiceClient() const; }; -/// \brief Azure-backed FileSystem implementation for ABFS and ADLS. +/// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and +/// Azure Data Lake Storage Gen2 (ADLS Gen2) [2]. +/// +/// ADLS Gen2 isn't a dedicated service or account type. It's a set of capabilities that +/// support high throughput analytic workloads, built on Azure Blob Storage. All the data +/// ingested via the ADLS Gen2 APIs is persisted as blobs in the storage account. +/// ADLS Gen2 provides filesystem semantics, file-level security, and Hadoop +/// compatibility. ADLS Gen1 exists as a separate object that will retired on 2024-02-29 +/// and new ADLS accounts use Gen2 instead. /// -/// ABFS (Azure Blob Storage - https://azure.microsoft.com/en-us/products/storage/blobs/) -/// object-based cloud storage system. +/// ADLS Gen2 and Blob APIs can operate on the same data, but there are +/// some limitations [3]. 
The ones that are relevant to this +/// implementation are listed here: /// -/// ADLS (Azure Data Lake Storage - -/// https://azure.microsoft.com/en-us/products/storage/data-lake-storage/) -/// is a scalable data storage system designed for big-data applications. -/// ADLS provides filesystem semantics, file-level security, and Hadoop -/// compatibility. Gen1 exists as a separate object that will retired -/// on Feb 29, 2024. New ADLS accounts will use Gen2 instead, which is -/// implemented on top of ABFS. +/// - You can't use Blob APIs, and ADLS APIs to write to the same instance of a file. If +/// you write to a file by using ADLS APIs then that file's blocks won't be visible +/// to calls to the GetBlockList Blob API. The only exception is when you're +/// overwriting. +/// - When you use the ListBlobs operation without specifying a delimiter, the results +/// include both directories and blobs. If you choose to use a delimiter, use only a +/// forward slash (/) -- the only supported delimiter. +/// - If you use the DeleteBlob API to delete a directory, that directory is deleted only +/// if it's empty. This means that you can't use the Blob API delete directories +/// recursively. /// -/// TODO: GH-18014 Complete the internal implementation -/// and review the documentation +/// [1]: https://azure.microsoft.com/en-us/products/storage/blobs +/// [2]: https://azure.microsoft.com/en-us/products/storage/data-lake-storage +/// [3]: +/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues class ARROW_EXPORT AzureFileSystem : public FileSystem { + private: + class Impl; + std::unique_ptr impl_; + + explicit AzureFileSystem(std::unique_ptr&& impl); + public: ~AzureFileSystem() override = default; + static Result> Make( + const AzureOptions& options, const io::IOContext& = io::default_io_context()); + std::string type_name() const override { return "abfs"; } /// Return the original Azure options when constructing the filesystem @@ -152,16 +180,6 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { Result> OpenAppendStream( const std::string& path, const std::shared_ptr& metadata = {}) override; - - static Result> Make( - const AzureOptions& options, const io::IOContext& = io::default_io_context()); - - private: - AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); - - class Impl; - std::unique_ptr impl_; }; -} // namespace fs -} // namespace arrow +} // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc index 3e545d670cb04..39c3fb23e3cfd 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -23,11 +23,17 @@ namespace arrow::fs::internal { +namespace { + +// TODO(GH-38772): Remove azurefs_internal.h/.cc by moving the detector to +// azurefs.cc (which contains a private copy of this helper function already). 
Status ExceptionToStatus(const std::string& prefix, const Azure::Storage::StorageException& exception) { return Status::IOError(prefix, " Azure Error: ", exception.what()); } +} // namespace + Status HierarchicalNamespaceDetector::Init( Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client) { datalake_service_client_ = datalake_service_client; diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index c3da96239a18f..92592cf164f5a 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -25,9 +25,6 @@ namespace arrow::fs::internal { -Status ExceptionToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception); - class HierarchicalNamespaceDetector { public: Status Init( diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 792c63b209402..463ff4e8daf3d 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -36,6 +36,7 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" +#include #include #include @@ -63,6 +64,7 @@ namespace arrow { using internal::TemporaryDir; namespace fs { +using internal::ConcatAbstractPath; namespace { namespace bp = boost::process; @@ -71,56 +73,133 @@ using ::testing::Not; using ::testing::NotNull; namespace Blobs = Azure::Storage::Blobs; -namespace Files = Azure::Storage::Files; +namespace Core = Azure::Core; +namespace DataLake = Azure::Storage::Files::DataLake; -auto const* kLoremIpsum = R"""( -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor -incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis -nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu -fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in -culpa qui officia deserunt mollit anim id est laborum. -)"""; +class BaseAzureEnv : public ::testing::Environment { + protected: + std::string account_name_; + std::string account_key_; + + BaseAzureEnv(std::string account_name, std::string account_key) + : account_name_(std::move(account_name)), account_key_(std::move(account_key)) {} -class AzuriteEnv : public ::testing::Environment { public: - AzuriteEnv() { - account_name_ = "devstoreaccount1"; - account_key_ = - "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/" - "KBHBeksoGMGw=="; - auto exe_path = bp::search_path("azurite"); - if (exe_path.empty()) { - auto error = std::string("Could not find Azurite emulator."); - status_ = Status::Invalid(error); - return; + const std::string& account_name() const { return account_name_; } + const std::string& account_key() const { return account_key_; } + + virtual AzureBackend backend() const = 0; + + virtual bool WithHierarchicalNamespace() const { return false; } + + virtual Result GetDebugLogSize() { return 0; } + virtual Status DumpDebugLog(int64_t position) { + return Status::NotImplemented("BaseAzureEnv::DumpDebugLog"); + } +}; + +template +class AzureEnvImpl : public BaseAzureEnv { + private: + /// \brief Factory function that registers the singleton instance as a global test + /// environment. Must be called only once per implementation (see GetInstance()). 
+ /// + /// Every BaseAzureEnv implementation defines a static and parameter-less member + /// function called Make() that returns a Result>. + /// This templated function performs the following steps: + /// + /// 1) Calls AzureEnvClass::Make() to get an instance of AzureEnvClass. + /// 2) Passes ownership of the AzureEnvClass instance to the testing environment. + /// 3) Returns a Result wrapping the raw heap-allocated pointer. + static Result MakeAndAddToGlobalTestEnvironment() { + ARROW_ASSIGN_OR_RAISE(auto env, AzureEnvClass::Make()); + auto* heap_ptr = env.release(); + ::testing::AddGlobalTestEnvironment(heap_ptr); + return heap_ptr; + } + + protected: + using BaseAzureEnv::BaseAzureEnv; + + /// \brief Create an AzureEnvClass instance from environment variables. + /// + /// Reads the account name and key from the environment variables. This can be + /// used in BaseAzureEnv implementations that don't need to do any additional + /// setup to create the singleton instance (e.g. AzureFlatNSEnv, + /// AzureHierarchicalNSEnv). + static Result> MakeFromEnvVars( + const std::string& account_name_var, const std::string& account_key_var) { + const auto account_name = std::getenv(account_name_var.c_str()); + const auto account_key = std::getenv(account_key_var.c_str()); + if (!account_name && !account_key) { + return Status::Cancelled(account_name_var + " and " + account_key_var + + " are not set. Skipping tests."); } - auto temp_dir_ = *TemporaryDir::Make("azurefs-test-"); - auto debug_log_path_result = temp_dir_->path().Join("debug.log"); - if (!debug_log_path_result.ok()) { - status_ = debug_log_path_result.status(); - return; + // If only one of the variables is set. Don't cancel tests, + // fail with a Status::Invalid. + if (!account_name) { + return Status::Invalid(account_name_var + " not set while " + account_key_var + + " is set."); } - debug_log_path_ = *debug_log_path_result; - server_process_ = - bp::child(boost::this_process::environment(), exe_path, "--silent", "--location", - temp_dir_->path().ToString(), "--debug", debug_log_path_.ToString()); - if (!(server_process_.valid() && server_process_.running())) { - auto error = "Could not start Azurite emulator."; - server_process_.terminate(); - server_process_.wait(); - status_ = Status::Invalid(error); - return; + if (!account_key) { + return Status::Invalid(account_key_var + " not set while " + account_name_var + + " is set."); } - status_ = Status::OK(); + return std::unique_ptr{new AzureEnvClass(account_name, account_key)}; } + public: + static Result GetInstance() { + // Ensure MakeAndAddToGlobalTestEnvironment() is called only once by storing the + // Result in a static variable. 
+ static auto singleton_env = MakeAndAddToGlobalTestEnvironment(); + return singleton_env; + } + + AzureBackend backend() const final { return AzureEnvClass::kBackend; } +}; + +class AzuriteEnv : public AzureEnvImpl { + private: + std::unique_ptr temp_dir_; + arrow::internal::PlatformFilename debug_log_path_; + bp::child server_process_; + + using AzureEnvImpl::AzureEnvImpl; + + public: + static const AzureBackend kBackend = AzureBackend::kAzurite; + ~AzuriteEnv() override { server_process_.terminate(); server_process_.wait(); } - Result GetDebugLogSize() { + static Result> Make() { + auto self = std::unique_ptr( + new AzuriteEnv("devstoreaccount1", + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/" + "K1SZFPTOtr/KBHBeksoGMGw==")); + auto exe_path = bp::search_path("azurite"); + if (exe_path.empty()) { + return Status::Invalid("Could not find Azurite emulator."); + } + ARROW_ASSIGN_OR_RAISE(self->temp_dir_, TemporaryDir::Make("azurefs-test-")); + ARROW_ASSIGN_OR_RAISE(self->debug_log_path_, + self->temp_dir_->path().Join("debug.log")); + auto server_process = bp::child( + boost::this_process::environment(), exe_path, "--silent", "--location", + self->temp_dir_->path().ToString(), "--debug", self->debug_log_path_.ToString()); + if (!server_process.valid() || !server_process.running()) { + server_process.terminate(); + server_process.wait(); + return Status::Invalid("Could not start Azurite emulator."); + } + self->server_process_ = std::move(server_process); + return self; + } + + Result GetDebugLogSize() override { ARROW_ASSIGN_OR_RAISE(auto exists, arrow::internal::FileExists(debug_log_path_)); if (!exists) { return 0; @@ -131,7 +210,7 @@ class AzuriteEnv : public ::testing::Environment { return arrow::internal::FileTell(file_descriptor.fd()); } - Status DumpDebugLog(int64_t position = 0) { + Status DumpDebugLog(int64_t position) override { ARROW_ASSIGN_OR_RAISE(auto exists, arrow::internal::FileExists(debug_log_path_)); if (!exists) { return Status::OK(); @@ -157,25 +236,35 @@ class AzuriteEnv : public ::testing::Environment { std::cerr << std::endl; return Status::OK(); } +}; - const std::string& account_name() const { return account_name_; } - const std::string& account_key() const { return account_key_; } - const Status status() const { return status_; } - +class AzureFlatNSEnv : public AzureEnvImpl { private: - std::string account_name_; - std::string account_key_; - bp::child server_process_; - Status status_; - std::unique_ptr temp_dir_; - arrow::internal::PlatformFilename debug_log_path_; + using AzureEnvImpl::AzureEnvImpl; + + public: + static const AzureBackend kBackend = AzureBackend::kAzure; + + static Result> Make() { + return MakeFromEnvVars("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME", + "AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); + } }; -auto* azurite_env = ::testing::AddGlobalTestEnvironment(new AzuriteEnv); +class AzureHierarchicalNSEnv : public AzureEnvImpl { + private: + using AzureEnvImpl::AzureEnvImpl; -AzuriteEnv* GetAzuriteEnv() { - return ::arrow::internal::checked_cast(azurite_env); -} + public: + static const AzureBackend kBackend = AzureBackend::kAzure; + + static Result> Make() { + return MakeFromEnvVars("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME", + "AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); + } + + bool WithHierarchicalNamespace() const final { return true; } +}; // Placeholder tests // TODO: GH-18014 Remove once a proper test is added @@ -193,44 +282,110 @@ TEST(AzureFileSystem, OptionsCompare) { EXPECT_TRUE(options.Equals(options)); } -class 
AzureFileSystemTest : public ::testing::Test { +struct PreexistingData { + public: + using RNG = std::mt19937_64; + + public: + const std::string container_name; + static constexpr char const* kObjectName = "test-object-name"; + + static constexpr char const* kLoremIpsum = R"""( +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. +)"""; + public: + explicit PreexistingData(RNG& rng) : container_name{RandomContainerName(rng)} {} + + // Creates a path by concatenating the container name and the stem. + std::string ContainerPath(std::string_view stem) const { + return ConcatAbstractPath(container_name, stem); + } + + std::string ObjectPath() const { return ContainerPath(kObjectName); } + std::string NotFoundObjectPath() const { return ContainerPath("not-found"); } + + std::string RandomDirectoryPath(RNG& rng) const { + return ContainerPath(RandomChars(32, rng)); + } + + // Utilities + static std::string RandomContainerName(RNG& rng) { return RandomChars(32, rng); } + + static std::string RandomChars(int count, RNG& rng) { + auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); + std::uniform_int_distribution d(0, static_cast(fillers.size()) - 1); + std::string s; + std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(rng)]; }); + return s; + } + + static int RandomIndex(int end, RNG& rng) { + return std::uniform_int_distribution(0, end - 1)(rng); + } + + static std::string RandomLine(int lineno, int width, RNG& rng) { + auto line = std::to_string(lineno) + ": "; + line += RandomChars(width - static_cast(line.size()) - 1, rng); + line += '\n'; + return line; + } +}; + +class TestAzureFileSystem : public ::testing::Test { + protected: + // Set in constructor + std::mt19937_64 rng_; + + // Set in SetUp() + int64_t debug_log_start_ = 0; + bool set_up_succeeded_ = false; + AzureOptions options_; + std::shared_ptr fs_; std::unique_ptr blob_service_client_; - std::unique_ptr datalake_service_client_; - AzureOptions options_; - std::mt19937_64 generator_; - std::string container_name_; - bool suite_skipped_ = false; + std::unique_ptr datalake_service_client_; + + public: + TestAzureFileSystem() : rng_(std::random_device()()) {} - AzureFileSystemTest() : generator_(std::random_device()()) {} + virtual Result GetAzureEnv() const = 0; - virtual Result MakeOptions() = 0; + static Result MakeOptions(BaseAzureEnv* env) { + AzureOptions options; + options.backend = env->backend(); + ARROW_EXPECT_OK( + options.ConfigureAccountKeyCredential(env->account_name(), env->account_key())); + return options; + } void SetUp() override { - auto options = MakeOptions(); - if (options.ok()) { - options_ = *options; + auto make_options = [this]() -> Result { + ARROW_ASSIGN_OR_RAISE(auto env, GetAzureEnv()); + EXPECT_THAT(env, NotNull()); + ARROW_ASSIGN_OR_RAISE(debug_log_start_, env->GetDebugLogSize()); + return MakeOptions(env); + }; + auto options_res = make_options(); + if (options_res.status().IsCancelled()) { + GTEST_SKIP() << options_res.status().message(); } else { - suite_skipped_ = true; - GTEST_SKIP() << options.status().message(); + EXPECT_OK_AND_ASSIGN(options_, options_res); } - // 
Stop-gap solution before GH-39119 is fixed. - container_name_ = "z" + RandomChars(31); - blob_service_client_ = std::make_unique( - options_.account_blob_url, options_.storage_credentials_provider); - datalake_service_client_ = std::make_unique( - options_.account_dfs_url, options_.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); - auto container_client = CreateContainer(container_name_); - auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); - blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), - strlen(kLoremIpsum)); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + EXPECT_OK_AND_ASSIGN(blob_service_client_, options_.MakeBlobServiceClient()); + EXPECT_OK_AND_ASSIGN(datalake_service_client_, options_.MakeDataLakeServiceClient()); + set_up_succeeded_ = true; } void TearDown() override { - if (!suite_skipped_) { + if (set_up_succeeded_) { auto containers = blob_service_client_->ListBlobContainers(); for (auto container : containers.BlobContainers) { auto container_client = @@ -238,6 +393,13 @@ class AzureFileSystemTest : public ::testing::Test { container_client.DeleteIfExists(); } } + if (HasFailure()) { + // XXX: This may not include all logs in the target test because + // Azurite doesn't flush debug logs immediately... You may want + // to check the log manually... + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + ARROW_IGNORE_EXPR(env->DumpDebugLog(debug_log_start_)); + } } Blobs::BlobContainerClient CreateContainer(const std::string& name) { @@ -254,54 +416,20 @@ class AzureFileSystemTest : public ::testing::Test { return blob_client; } - std::string PreexistingContainerName() const { return container_name_; } - - std::string PreexistingContainerPath() const { - return PreexistingContainerName() + '/'; - } - - static std::string PreexistingObjectName() { return "test-object-name"; } - - std::string PreexistingObjectPath() const { - return PreexistingContainerPath() + PreexistingObjectName(); - } - - std::string NotFoundObjectPath() { return PreexistingContainerPath() + "not-found"; } - - std::string RandomLine(int lineno, std::size_t width) { - auto line = std::to_string(lineno) + ": "; - line += RandomChars(width - line.size() - 1); - line += '\n'; - return line; - } - - std::size_t RandomIndex(std::size_t end) { - return std::uniform_int_distribution(0, end - 1)(generator_); - } - - std::string RandomChars(std::size_t count) { - auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); - std::uniform_int_distribution d(0, fillers.size() - 1); - std::string s; - std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(generator_)]; }); - return s; - } - - std::string RandomContainerName() { return RandomChars(32); } - - std::string RandomDirectoryName() { return RandomChars(32); } - - void UploadLines(const std::vector& lines, const char* path_to_file, + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { - const auto path = PreexistingContainerPath() + path_to_file; ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); ASSERT_OK(output->Write(all_lines)); ASSERT_OK(output->Close()); } - void RunGetFileInfoObjectWithNestedStructureTest(); - void RunGetFileInfoObjectTest(); + PreexistingData SetUpPreexistingData() { + PreexistingData data(rng_); + auto container_client = CreateContainer(data.container_name); + CreateBlob(container_client, 
data.kObjectName, PreexistingData::kLoremIpsum); + return data; + } struct HierarchicalPaths { std::string container; @@ -310,15 +438,12 @@ class AzureFileSystemTest : public ::testing::Test { }; // Need to use "void" as the return type to use ASSERT_* in this method. - void CreateHierarchicalData(HierarchicalPaths& paths) { - const auto container_path = RandomContainerName(); - const auto directory_path = - internal::ConcatAbstractPath(container_path, RandomDirectoryName()); - const auto sub_directory_path = - internal::ConcatAbstractPath(directory_path, "new-sub"); - const auto sub_blob_path = - internal::ConcatAbstractPath(sub_directory_path, "sub.txt"); - const auto top_blob_path = internal::ConcatAbstractPath(directory_path, "top.txt"); + void CreateHierarchicalData(HierarchicalPaths* paths) { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + const auto sub_directory_path = ConcatAbstractPath(directory_path, "new-sub"); + const auto sub_blob_path = ConcatAbstractPath(sub_directory_path, "sub.txt"); + const auto top_blob_path = ConcatAbstractPath(directory_path, "top.txt"); ASSERT_OK(fs_->CreateDir(sub_directory_path, true)); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(sub_blob_path)); ASSERT_OK(output->Write(std::string_view("sub"))); @@ -327,15 +452,15 @@ class AzureFileSystemTest : public ::testing::Test { ASSERT_OK(output->Write(std::string_view("top"))); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), container_path, FileType::Directory); + AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); AssertFileInfo(fs_.get(), directory_path, FileType::Directory); AssertFileInfo(fs_.get(), sub_directory_path, FileType::Directory); AssertFileInfo(fs_.get(), sub_blob_path, FileType::File); AssertFileInfo(fs_.get(), top_blob_path, FileType::File); - paths.container = container_path; - paths.directory = directory_path; - paths.sub_paths = { + paths->container = data.container_name; + paths->directory = directory_path; + paths->sub_paths = { sub_directory_path, sub_blob_path, top_blob_path, @@ -362,7 +487,7 @@ class AzureFileSystemTest : public ::testing::Test { } void AssertInfoAllContainersRecursive(const std::vector& infos) { - ASSERT_EQ(infos.size(), 14); + ASSERT_EQ(infos.size(), 12); AssertFileInfo(infos[0], "container", FileType::Directory); AssertFileInfo(infos[1], "container/emptydir", FileType::Directory); AssertFileInfo(infos[2], "container/otherdir", FileType::Directory); @@ -377,202 +502,336 @@ class AzureFileSystemTest : public ::testing::Test { strlen(kSubData)); AssertFileInfo(infos[10], "container/somefile", FileType::File, strlen(kSomeData)); AssertFileInfo(infos[11], "empty-container", FileType::Directory); - AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory); - AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File); } -}; -class AzuriteFileSystemTest : public AzureFileSystemTest { - Result MakeOptions() override { - EXPECT_THAT(GetAzuriteEnv(), NotNull()); - ARROW_EXPECT_OK(GetAzuriteEnv()->status()); - ARROW_ASSIGN_OR_RAISE(debug_log_start_, GetAzuriteEnv()->GetDebugLogSize()); - AzureOptions options; - options.backend = AzureBackend::Azurite; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( - GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); - return options; + bool WithHierarchicalNamespace() const { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + return env->WithHierarchicalNamespace(); } - void TearDown() override { - 
AzureFileSystemTest::TearDown(); - if (HasFailure()) { - // XXX: This may not include all logs in the target test because - // Azurite doesn't flush debug logs immediately... You may want - // to check the log manually... - ARROW_IGNORE_EXPR(GetAzuriteEnv()->DumpDebugLog(debug_log_start_)); + // Tests that are called from more than one implementation of TestAzureFileSystem + + void TestDetectHierarchicalNamespace(); + void TestGetFileInfoObject(); + void TestGetFileInfoObjectWithNestedStructure(); + + void TestDeleteDirSuccessEmpty() { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + + if (WithHierarchicalNamespace()) { + ASSERT_OK(fs_->CreateDir(directory_path, true)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::Directory); + ASSERT_OK(fs_->DeleteDir(directory_path)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + } else { + // There is only virtual directory without hierarchical namespace + // support. So the CreateDir() and DeleteDir() do nothing. + ASSERT_OK(fs_->CreateDir(directory_path)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs_->DeleteDir(directory_path)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); } } - int64_t debug_log_start_ = 0; -}; + void TestCreateDirSuccessContainerAndDirectory() { + auto data = SetUpPreexistingData(); + const auto path = data.RandomDirectoryPath(rng_); + ASSERT_OK(fs_->CreateDir(path, false)); + if (WithHierarchicalNamespace()) { + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + } else { + // There is only virtual directory without hierarchical namespace + // support. So the CreateDir() does nothing. + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + } + } -class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { - Result MakeOptions() override { - AzureOptions options; - const auto account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); - const auto account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME"); - if (account_key && account_name) { - RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); - return options; + void TestCreateDirRecursiveSuccessContainerOnly() { + auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs_->CreateDir(container_name, true)); + arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + } + + void TestCreateDirRecursiveSuccessDirectoryOnly() { + auto data = SetUpPreexistingData(); + const auto parent = data.RandomDirectoryPath(rng_); + const auto path = ConcatAbstractPath(parent, "new-sub"); + ASSERT_OK(fs_->CreateDir(path, true)); + if (WithHierarchicalNamespace()) { + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); + } else { + // There is only virtual directory without hierarchical namespace + // support. So the CreateDir() does nothing. + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); } - return Status::Cancelled( - "Connection details not provided for a real flat namespace " - "account."); } -}; -// How to enable this test: -// -// You need an Azure account. You should be able to create a free -// account at https://azure.microsoft.com/en-gb/free/ . You should be -// able to create a storage account through the portal Web UI. 
-// -// See also the official document how to create a storage account: -// https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account -// -// A few suggestions on configuration: -// -// * Use Standard general-purpose v2 not premium -// * Use LRS redundancy -// * Obviously you need to enable hierarchical namespace. -// * Set the default access tier to hot -// * SFTP, NFS and file shares are not required. -class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { - Result MakeOptions() override { - AzureOptions options; - const auto account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); - const auto account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME"); - if (account_key && account_name) { - RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); - return options; + void TestCreateDirRecursiveSuccessContainerAndDirectory() { + auto data = SetUpPreexistingData(); + const auto parent = data.RandomDirectoryPath(rng_); + const auto path = ConcatAbstractPath(parent, "new-sub"); + ASSERT_OK(fs_->CreateDir(path, true)); + if (WithHierarchicalNamespace()) { + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); + arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + } else { + // There is only virtual directory without hierarchical namespace + // support. So the CreateDir() does nothing. + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); } - return Status::Cancelled( - "Connection details not provided for a real hierarchical namespace " - "account."); } -}; -TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) { - auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); -} + void TestDeleteDirContentsSuccessNonexistent() { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) { - auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); -} + void TestDeleteDirContentsFailureNonexistent() { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); + } +}; -TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { - auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); -} +void TestAzureFileSystem::TestDetectHierarchicalNamespace() { + // Check the environments are implemented and injected here correctly. 
+ auto expected = WithHierarchicalNamespace(); -TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { + auto data = SetUpPreexistingData(); auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_NOT_OK(hierarchical_namespace.Enabled("nonexistent-container")); + ASSERT_OK_AND_EQ(expected, hierarchical_namespace.Enabled(data.container_name)); } -TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { - AssertFileInfo(fs_.get(), "", FileType::Directory); - - // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); -} - -TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { - AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); +void TestAzureFileSystem::TestGetFileInfoObject() { + auto data = SetUpPreexistingData(); + auto object_properties = + blob_service_client_->GetBlobContainerClient(data.container_name) + .GetBlobClient(data.kObjectName) + .GetProperties() + .Value; - AssertFileInfo(fs_.get(), "nonexistent-container", FileType::NotFound); + AssertFileInfo(fs_.get(), data.ObjectPath(), FileType::File, + std::chrono::system_clock::time_point{object_properties.LastModified}, + static_cast(object_properties.BlobSize)); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + std::string{data.kObjectName})); } -void AzureFileSystemTest::RunGetFileInfoObjectWithNestedStructureTest() { +void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { + auto data = SetUpPreexistingData(); // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). - constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - ASSERT_OK_AND_ASSIGN( - auto output, - fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName, /*metadata=*/{})); - const std::string_view data(kLoremIpsum); - ASSERT_OK(output->Write(data)); + const std::string kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(data.ContainerPath(kObjectName), + /*metadata=*/{})); + const std::string_view lorem_ipsum(PreexistingData::kLoremIpsum); + ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. 
- ASSERT_OK_AND_ASSIGN(output, - fs_->OpenOutputStream( - PreexistingContainerPath() + "test-object-dir/some_other_dir0", - /*metadata=*/{})); - ASSERT_OK(output->Write(data)); - ASSERT_OK(output->Close()); ASSERT_OK_AND_ASSIGN( - output, fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName + "0", + output, fs_->OpenOutputStream(data.ContainerPath("test-object-dir/some_other_dir0"), /*metadata=*/{})); - ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Write(lorem_ipsum)); + ASSERT_OK(output->Close()); + ASSERT_OK_AND_ASSIGN(output, + fs_->OpenOutputStream(data.ContainerPath(kObjectName + "0"), + /*metadata=*/{})); + ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", - FileType::NotFound); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir", - FileType::Directory); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/", + AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName), FileType::File); + AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); + AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir"), FileType::Directory); + AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir") + "/", FileType::Directory); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_dir", + AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir"), FileType::Directory); - AssertFileInfo(fs_.get(), - PreexistingContainerPath() + "test-object-dir/some_other_dir/", + AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir") + "/", FileType::Directory); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di", - FileType::NotFound); - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", + AssertFileInfo(fs_.get(), data.ContainerPath("test-object-di"), FileType::NotFound); + AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_di"), FileType::NotFound); + + if (WithHierarchicalNamespace()) { + datalake_service_client_->GetFileSystemClient(data.container_name) + .GetDirectoryClient("test-empty-object-dir") + .Create(); + + AssertFileInfo(fs_.get(), data.ContainerPath("test-empty-object-dir"), + FileType::Directory); + } } -TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { - RunGetFileInfoObjectWithNestedStructureTest(); +template +class AzureFileSystemTestImpl : public TestAzureFileSystem { + public: + using TestAzureFileSystem::TestAzureFileSystem; + + Result GetAzureEnv() const final { return AzureEnvClass::GetInstance(); } +}; + +// How to enable the non-Azurite tests: +// +// You need an Azure account. You should be able to create a free account [1]. +// Through the portal Web UI, you should create a storage account [2]. +// +// A few suggestions on configuration: +// +// * Use Standard general-purpose v2 not premium +// * Use LRS redundancy +// * Set the default access tier to hot +// * SFTP, NFS and file shares are not required. +// +// You must not enable Hierarchical Namespace on the storage account used for +// TestAzureFlatNSFileSystem, but you must enable it on the storage account +// used for TestAzureHierarchicalNSFileSystem. 
+// +// The credentials should be placed in the correct environment variables: +// +// * AZURE_FLAT_NAMESPACE_ACCOUNT_NAME +// * AZURE_FLAT_NAMESPACE_ACCOUNT_KEY +// * AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME +// * AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY +// +// [1]: https://azure.microsoft.com/en-gb/free/ +// [2]: +// https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account +using TestAzureFlatNSFileSystem = AzureFileSystemTestImpl; +using TestAzureHierarchicalNSFileSystem = AzureFileSystemTestImpl; +using TestAzuriteFileSystem = AzureFileSystemTestImpl; + +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) + +template +using AzureFileSystemTestOnAllEnvs = AzureFileSystemTestImpl; + +using AllEnvironments = + ::testing::Types; + +TYPED_TEST_SUITE(AzureFileSystemTestOnAllEnvs, AllEnvironments); + +TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespace) { + this->TestDetectHierarchicalNamespace(); } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObjectWithNestedStructure) { - RunGetFileInfoObjectWithNestedStructureTest(); - datalake_service_client_->GetFileSystemClient(PreexistingContainerName()) - .GetDirectoryClient("test-empty-object-dir") - .Create(); +TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObject) { + this->TestGetFileInfoObject(); +} - AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-empty-object-dir", - FileType::Directory); +TYPED_TEST(AzureFileSystemTestOnAllEnvs, DeleteDirSuccessEmpty) { + this->TestDeleteDirSuccessEmpty(); } -void AzureFileSystemTest::RunGetFileInfoObjectTest() { - auto object_properties = - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlobClient(PreexistingObjectName()) - .GetProperties() - .Value; +TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObjectWithNestedStructure) { + this->TestGetFileInfoObjectWithNestedStructure(); +} - AssertFileInfo(fs_.get(), PreexistingObjectPath(), FileType::File, - std::chrono::system_clock::time_point(object_properties.LastModified), - static_cast(object_properties.BlobSize)); +TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirSuccessContainerAndDirectory) { + this->TestCreateDirSuccessContainerAndDirectory(); +} + +TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerOnly) { + this->TestCreateDirRecursiveSuccessContainerOnly(); +} + +TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessDirectoryOnly) { + this->TestCreateDirRecursiveSuccessDirectoryOnly(); +} + +TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDirectory) { + this->TestCreateDirRecursiveSuccessContainerAndDirectory(); +} + +// Tests using a real storage account *with Hierarchical Namespace enabled* + +TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirFailureNonexistent) { + auto data = SetUpPreexistingData(); + const auto path = data.RandomDirectoryPath(rng_); + ASSERT_RAISES(IOError, fs_->DeleteDir(path)); +} + +TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveBlob) { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK(output->Write(std::string_view("hello"))); + ASSERT_OK(output->Close()); + arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); + ASSERT_OK(fs_->DeleteDir(directory_path)); + 
arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); +} + +TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveDirectory) { + auto data = SetUpPreexistingData(); + const auto parent = data.RandomDirectoryPath(rng_); + const auto path = ConcatAbstractPath(parent, "new-sub"); + ASSERT_OK(fs_->CreateDir(path, true)); + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); + ASSERT_OK(fs_->DeleteDir(parent)); + arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); +} + +TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessExist) { + auto preexisting_data = SetUpPreexistingData(); + HierarchicalPaths paths; + CreateHierarchicalData(&paths); + ASSERT_OK(fs_->DeleteDirContents(paths.directory)); + arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::Directory); + for (const auto& sub_path : paths.sub_paths) { + arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + } +} + +TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessNonexistent) { + this->TestDeleteDirContentsSuccessNonexistent(); +} + +TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { + this->TestDeleteDirContentsFailureNonexistent(); +} + +// Tests using Azurite (the local Azure emulator) + +TEST_F(TestAzuriteFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_RAISES(IOError, hierarchical_namespace.Enabled("nonexistent-container")); +} + +TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { + AssertFileInfo(fs_.get(), "", FileType::Directory); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); } -TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } +TEST_F(TestAzuriteFileSystem, GetFileInfoContainer) { + auto data = SetUpPreexistingData(); + AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); -TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { - RunGetFileInfoObjectTest(); + AssertFileInfo(fs_.get(), "nonexistent-container", FileType::NotFound); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + data.container_name)); } -TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { +TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { SetUpSmallFileSystemTree(); FileSelector select; @@ -581,11 +840,10 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { // Root dir select.base_dir = ""; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - ASSERT_EQ(infos.size(), 3); + ASSERT_EQ(infos.size(), 2); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container", FileType::Directory); AssertFileInfo(infos[1], "empty-container", FileType::Directory); - AssertFileInfo(infos[2], container_name_, FileType::Directory); // Empty container select.base_dir = "empty-container"; @@ -641,7 +899,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { ASSERT_EQ(infos.size(), 4); } -TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { +TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { SetUpSmallFileSystemTree(); FileSelector select; @@ -651,7 +909,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { // Root 
dir select.base_dir = ""; ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); - ASSERT_EQ(infos.size(), 14); + ASSERT_EQ(infos.size(), 12); ASSERT_EQ(infos, SortedInfos(infos)); AssertInfoAllContainersRecursive(infos); @@ -699,7 +957,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); } -TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) { +TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { { auto container = CreateContainer("container"); CreateBlob(container, "mydir/emptydir1/"); @@ -746,137 +1004,60 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) { AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); } -TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) { +TEST_F(TestAzuriteFileSystem, CreateDirFailureNoContainer) { ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); } -TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerOnly) { - auto container_name = RandomContainerName(); +TEST_F(TestAzuriteFileSystem, CreateDirSuccessContainerOnly) { + auto container_name = PreexistingData::RandomContainerName(rng_); ASSERT_OK(fs_->CreateDir(container_name, false)); arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); } -TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerAndDirectory) { - const auto path = PreexistingContainerPath() + RandomDirectoryName(); - ASSERT_OK(fs_->CreateDir(path, false)); - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, CreateDirSuccessContainerAndDirectory) { - const auto path = PreexistingContainerPath() + RandomDirectoryName(); - ASSERT_OK(fs_->CreateDir(path, false)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); -} - -TEST_F(AzuriteFileSystemTest, CreateDirFailureDirectoryWithMissingContainer) { +TEST_F(TestAzuriteFileSystem, CreateDirFailureDirectoryWithMissingContainer) { const auto path = std::string("not-a-container/new-directory"); ASSERT_RAISES(IOError, fs_->CreateDir(path, false)); } -TEST_F(AzuriteFileSystemTest, CreateDirRecursiveFailureNoContainer) { +TEST_F(TestAzuriteFileSystem, CreateDirRecursiveFailureNoContainer) { ASSERT_RAISES(Invalid, fs_->CreateDir("", true)); } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, CreateDirRecursiveSuccessContainerOnly) { - auto container_name = RandomContainerName(); - ASSERT_OK(fs_->CreateDir(container_name, true)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); -} - -TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerOnly) { - auto container_name = RandomContainerName(); - ASSERT_OK(fs_->CreateDir(container_name, true)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, CreateDirRecursiveSuccessDirectoryOnly) { - const auto parent = PreexistingContainerPath() + RandomDirectoryName(); - const auto path = internal::ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); -} - -TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessDirectoryOnly) { - const auto parent = 
PreexistingContainerPath() + RandomDirectoryName(); - const auto path = internal::ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, - CreateDirRecursiveSuccessContainerAndDirectory) { - auto container_name = RandomContainerName(); - const auto parent = internal::ConcatAbstractPath(container_name, RandomDirectoryName()); - const auto path = internal::ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); +TEST_F(TestAzuriteFileSystem, CreateDirUri) { + ASSERT_RAISES( + Invalid, + fs_->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); } -TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerAndDirectory) { - auto container_name = RandomContainerName(); - const auto parent = internal::ConcatAbstractPath(container_name, RandomDirectoryName()); - const auto path = internal::ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); -} - -TEST_F(AzuriteFileSystemTest, CreateDirUri) { - ASSERT_RAISES(Invalid, fs_->CreateDir("abfs://" + RandomContainerName(), true)); -} - -TEST_F(AzuriteFileSystemTest, DeleteDirSuccessContainer) { - const auto container_name = RandomContainerName(); +TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { + const auto container_name = PreexistingData::RandomContainerName(rng_); ASSERT_OK(fs_->CreateDir(container_name)); arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); ASSERT_OK(fs_->DeleteDir(container_name)); arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::NotFound); } -TEST_F(AzuriteFileSystemTest, DeleteDirSuccessEmpty) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() and DeleteDir() do nothing. - ASSERT_OK(fs_->CreateDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); -} - -TEST_F(AzuriteFileSystemTest, DeleteDirSuccessNonexistent) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); +TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); // There is only virtual directory without hierarchical namespace // support. So the DeleteDir() for nonexistent directory does nothing. 
ASSERT_OK(fs_->DeleteDir(directory_path)); arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); } -TEST_F(AzuriteFileSystemTest, DeleteDirSuccessHaveBlobs) { +TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { #ifdef __APPLE__ GTEST_SKIP() << "This test fails by an Azurite problem: " "https://github.com/Azure/Azurite/pull/2302"; #endif - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); // We must use 257 or more blobs here to test pagination of ListBlobs(). // Because we can't add 257 or more delete blob requests to one SubmitBatch(). int64_t n_blobs = 257; for (int64_t i = 0; i < n_blobs; ++i) { - const auto blob_path = - internal::ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); + const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view(std::to_string(i)))); ASSERT_OK(output->Close()); @@ -884,62 +1065,24 @@ TEST_F(AzuriteFileSystemTest, DeleteDirSuccessHaveBlobs) { } ASSERT_OK(fs_->DeleteDir(directory_path)); for (int64_t i = 0; i < n_blobs; ++i) { - const auto blob_path = - internal::ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); + const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); } } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirSuccessEmpty) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - ASSERT_OK(fs_->CreateDir(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirFailureNonexistent) { - const auto path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - ASSERT_RAISES(IOError, fs_->DeleteDir(path)); +TEST_F(TestAzuriteFileSystem, DeleteDirUri) { + auto data = SetUpPreexistingData(); + ASSERT_RAISES(Invalid, fs_->DeleteDir("abfs://" + data.container_name + "/")); } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirSuccessHaveBlob) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - const auto blob_path = internal::ConcatAbstractPath(directory_path, "hello.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); - ASSERT_OK(output->Write(std::string_view("hello"))); - ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirSuccessHaveDirectory) { - const auto parent = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - const auto path = internal::ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(parent)); - arrow::fs::AssertFileInfo(fs_.get(), path, 
FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); -} - -TEST_F(AzuriteFileSystemTest, DeleteDirUri) { - ASSERT_RAISES(Invalid, fs_->DeleteDir("abfs://" + PreexistingContainerPath())); -} - -TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessContainer) { +TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { #ifdef __APPLE__ GTEST_SKIP() << "This test fails by an Azurite problem: " "https://github.com/Azure/Azurite/pull/2302"; #endif + auto data = SetUpPreexistingData(); HierarchicalPaths paths; - CreateHierarchicalData(paths); + CreateHierarchicalData(&paths); ASSERT_OK(fs_->DeleteDirContents(paths.container)); arrow::fs::AssertFileInfo(fs_.get(), paths.container, FileType::Directory); arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); @@ -948,13 +1091,14 @@ TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessContainer) { } } -TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessDirectory) { +TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { #ifdef __APPLE__ GTEST_SKIP() << "This test fails by an Azurite problem: " "https://github.com/Azure/Azurite/pull/2302"; #endif + auto data = SetUpPreexistingData(); HierarchicalPaths paths; - CreateHierarchicalData(paths); + CreateHierarchicalData(&paths); ASSERT_OK(fs_->DeleteDirContents(paths.directory)); // GH-38772: We may change this to FileType::Directory. arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); @@ -963,98 +1107,72 @@ TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessDirectory) { } } -TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessNonexistent) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); -} - -TEST_F(AzuriteFileSystemTest, DeleteDirContentsFailureNonexistent) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirContentsSuccessExist) { - HierarchicalPaths paths; - CreateHierarchicalData(paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::Directory); - for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); - } -} - -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirContentsSuccessNonexistent) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); +TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessNonexistent) { + this->TestDeleteDirContentsSuccessNonexistent(); } -TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirContentsFailureNonexistent) { - const auto directory_path = - internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); - ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); +TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { + this->TestDeleteDirContentsFailureNonexistent(); } -TEST_F(AzuriteFileSystemTest, CopyFileSuccessDestinationNonexistent) { - const auto destination_path = - 
internal::ConcatAbstractPath(PreexistingContainerName(), "copy-destionation"); - ASSERT_OK(fs_->CopyFile(PreexistingObjectPath(), destination_path)); +TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { + auto data = SetUpPreexistingData(); + const auto destination_path = data.ContainerPath("copy-destionation"); + ASSERT_OK(fs_->CopyFile(data.ObjectPath(), destination_path)); ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(destination_path)); ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); - EXPECT_EQ(kLoremIpsum, buffer->ToString()); + EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } -TEST_F(AzuriteFileSystemTest, CopyFileSuccessDestinationSame) { - ASSERT_OK(fs_->CopyFile(PreexistingObjectPath(), PreexistingObjectPath())); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); +TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { + auto data = SetUpPreexistingData(); + ASSERT_OK(fs_->CopyFile(data.ObjectPath(), data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); - EXPECT_EQ(kLoremIpsum, buffer->ToString()); + EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } -TEST_F(AzuriteFileSystemTest, CopyFileFailureDestinationTrailingSlash) { - ASSERT_RAISES(IOError, - fs_->CopyFile(PreexistingObjectPath(), - internal::EnsureTrailingSlash(PreexistingObjectPath()))); +TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationTrailingSlash) { + auto data = SetUpPreexistingData(); + ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), + internal::EnsureTrailingSlash(data.ObjectPath()))); } -TEST_F(AzuriteFileSystemTest, CopyFileFailureSourceNonexistent) { - const auto destination_path = - internal::ConcatAbstractPath(PreexistingContainerName(), "copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(NotFoundObjectPath(), destination_path)); +TEST_F(TestAzuriteFileSystem, CopyFileFailureSourceNonexistent) { + auto data = SetUpPreexistingData(); + const auto destination_path = data.ContainerPath("copy-destionation"); + ASSERT_RAISES(IOError, fs_->CopyFile(data.NotFoundObjectPath(), destination_path)); } -TEST_F(AzuriteFileSystemTest, CopyFileFailureDestinationParentNonexistent) { +TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationParentNonexistent) { + auto data = SetUpPreexistingData(); const auto destination_path = - internal::ConcatAbstractPath(RandomContainerName(), "copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(PreexistingObjectPath(), destination_path)); + ConcatAbstractPath(PreexistingData::RandomContainerName(rng_), "copy-destionation"); + ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), destination_path)); } -TEST_F(AzuriteFileSystemTest, CopyFileUri) { - const auto destination_path = - internal::ConcatAbstractPath(PreexistingContainerName(), "copy-destionation"); - ASSERT_RAISES(Invalid, - fs_->CopyFile("abfs://" + PreexistingObjectPath(), destination_path)); - ASSERT_RAISES(Invalid, - fs_->CopyFile(PreexistingObjectPath(), "abfs://" + destination_path)); +TEST_F(TestAzuriteFileSystem, CopyFileUri) { + auto data = SetUpPreexistingData(); + const auto destination_path = data.ContainerPath("copy-destionation"); + ASSERT_RAISES(Invalid, fs_->CopyFile("abfs://" + data.ObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, fs_->CopyFile(data.ObjectPath(), 
"abfs://" + destination_path)); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamString) { +TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { + auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); - EXPECT_EQ(buffer->ToString(), kLoremIpsum); + EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamStringBuffers) { +TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { + auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); std::string contents; std::shared_ptr buffer; @@ -1063,23 +1181,25 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamStringBuffers) { contents.append(buffer->ToString()); } while (buffer && buffer->size() != 0); - EXPECT_EQ(contents, kLoremIpsum); + EXPECT_EQ(contents, PreexistingData::kLoremIpsum); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamInfo) { - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputStreamInfo) { + auto data = SetUpPreexistingData(); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); - EXPECT_EQ(buffer->ToString(), kLoremIpsum); + EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamEmpty) { +TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { + auto data = SetUpPreexistingData(); const auto path_to_file = "empty-object.txt"; - const auto path = PreexistingContainerPath() + path_to_file; - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + const auto path = data.ContainerPath(path_to_file); + blob_service_client_->GetBlobContainerClient(data.container_name) .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); @@ -1090,24 +1210,28 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamEmpty) { EXPECT_EQ(size, 0); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamNotFound) { - ASSERT_RAISES(IOError, fs_->OpenInputStream(NotFoundObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputStreamNotFound) { + auto data = SetUpPreexistingData(); + ASSERT_RAISES(IOError, fs_->OpenInputStream(data.NotFoundObjectPath())); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) { - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); +TEST_F(TestAzuriteFileSystem, OpenInputStreamInfoInvalid) { + auto data = SetUpPreexistingData(); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name + "/")); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) { - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + PreexistingObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputStreamUri) { + auto data = SetUpPreexistingData(); + ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + data.ObjectPath())); } -TEST_F(AzuriteFileSystemTest, 
OpenInputStreamTrailingSlash) { - ASSERT_RAISES(IOError, fs_->OpenInputStream(PreexistingObjectPath() + '/')); +TEST_F(TestAzuriteFileSystem, OpenInputStreamTrailingSlash) { + auto data = SetUpPreexistingData(); + ASSERT_RAISES(IOError, fs_->OpenInputStream(data.ObjectPath() + '/')); } namespace { @@ -1145,9 +1269,10 @@ std::shared_ptr NormalizerKeyValueMetadata( } }; // namespace -TEST_F(AzuriteFileSystemTest, OpenInputStreamReadMetadata) { +TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { + auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); std::shared_ptr actual; ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); @@ -1175,8 +1300,9 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamReadMetadata) { NormalizerKeyValueMetadata(actual)->ToString()); } -TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(PreexistingObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputStreamClosed) { + auto data = SetUpPreexistingData(); + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); @@ -1184,44 +1310,45 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { ASSERT_RAISES(Invalid, stream->Tell()); } -TEST_F(AzuriteFileSystemTest, TestWriteMetadata) { +TEST_F(TestAzuriteFileSystem, WriteMetadata) { + auto data = SetUpPreexistingData(); options_.default_metadata = arrow::key_value_metadata({{"foo", "bar"}}); ASSERT_OK_AND_ASSIGN(auto fs_with_defaults, AzureFileSystem::Make(options_)); - std::string path = "object_with_defaults"; - auto location = PreexistingContainerPath() + path; + std::string blob_path = "object_with_defaults"; + auto full_path = data.ContainerPath(blob_path); ASSERT_OK_AND_ASSIGN(auto output, - fs_with_defaults->OpenOutputStream(location, /*metadata=*/{})); - const std::string_view expected(kLoremIpsum); + fs_with_defaults->OpenOutputStream(full_path, /*metadata=*/{})); + const std::string_view expected(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); // Verify the metadata has been set. - auto blob_metadata = - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path) - .GetProperties() - .Value.Metadata; - EXPECT_EQ(Azure::Core::CaseInsensitiveMap{std::make_pair("foo", "bar")}, blob_metadata); + auto blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name) + .GetBlockBlobClient(blob_path) + .GetProperties() + .Value.Metadata; + EXPECT_EQ(Core::CaseInsensitiveMap{std::make_pair("foo", "bar")}, blob_metadata); // Check that explicit metadata overrides the defaults. ASSERT_OK_AND_ASSIGN( output, fs_with_defaults->OpenOutputStream( - location, /*metadata=*/arrow::key_value_metadata({{"bar", "foo"}}))); + full_path, /*metadata=*/arrow::key_value_metadata({{"bar", "foo"}}))); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); - blob_metadata = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path) + blob_metadata = blob_service_client_->GetBlobContainerClient(data.container_name) + .GetBlockBlobClient(blob_path) .GetProperties() .Value.Metadata; // Defaults are overwritten and not merged. 
- EXPECT_EQ(Azure::Core::CaseInsensitiveMap{std::make_pair("bar", "foo")}, blob_metadata); + EXPECT_EQ(Core::CaseInsensitiveMap{std::make_pair("bar", "foo")}, blob_metadata); } -TEST_F(AzuriteFileSystemTest, OpenOutputStreamSmall) { - const auto path = PreexistingContainerPath() + "test-write-object"; +TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); - const std::string_view expected(kLoremIpsum); + const std::string_view expected(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); @@ -1234,8 +1361,9 @@ TEST_F(AzuriteFileSystemTest, OpenOutputStreamSmall) { EXPECT_EQ(expected, std::string_view(inbuf.data(), size)); } -TEST_F(AzuriteFileSystemTest, OpenOutputStreamLarge) { - const auto path = PreexistingContainerPath() + "test-write-object"; +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; std::array buffers{ @@ -1265,8 +1393,9 @@ TEST_F(AzuriteFileSystemTest, OpenOutputStreamLarge) { EXPECT_EQ(contents, buffers[0] + buffers[1] + buffers[2]); } -TEST_F(AzuriteFileSystemTest, OpenOutputStreamTruncatesExistingFile) { - const auto path = PreexistingContainerPath() + "test-write-object"; +TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); @@ -1281,7 +1410,7 @@ TEST_F(AzuriteFileSystemTest, OpenOutputStreamTruncatesExistingFile) { EXPECT_EQ(expected0, std::string_view(inbuf.data(), size)); ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(path, {})); - const std::string_view expected1(kLoremIpsum); + const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); @@ -1291,8 +1420,9 @@ TEST_F(AzuriteFileSystemTest, OpenOutputStreamTruncatesExistingFile) { EXPECT_EQ(expected1, std::string_view(inbuf.data(), size)); } -TEST_F(AzuriteFileSystemTest, OpenAppendStreamDoesNotTruncateExistingFile) { - const auto path = PreexistingContainerPath() + "test-write-object"; +TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); @@ -1307,7 +1437,7 @@ TEST_F(AzuriteFileSystemTest, OpenAppendStreamDoesNotTruncateExistingFile) { EXPECT_EQ(expected0, std::string_view(inbuf.data())); ASSERT_OK_AND_ASSIGN(output, fs_->OpenAppendStream(path, {})); - const std::string_view expected1(kLoremIpsum); + const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); @@ -1319,35 +1449,37 @@ TEST_F(AzuriteFileSystemTest, OpenAppendStreamDoesNotTruncateExistingFile) { std::string(expected0) + std::string(expected1)); } -TEST_F(AzuriteFileSystemTest, OpenOutputStreamClosed) { - const auto path = 
internal::ConcatAbstractPath(PreexistingContainerName(), - "open-output-stream-closed.txt"); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("open-output-stream-closed.txt"); ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); ASSERT_OK(output->Close()); - ASSERT_RAISES(Invalid, output->Write(kLoremIpsum, std::strlen(kLoremIpsum))); + ASSERT_RAISES(Invalid, output->Write(PreexistingData::kLoremIpsum, + std::strlen(PreexistingData::kLoremIpsum))); ASSERT_RAISES(Invalid, output->Flush()); ASSERT_RAISES(Invalid, output->Tell()); } -TEST_F(AzuriteFileSystemTest, OpenOutputStreamUri) { - const auto path = internal::ConcatAbstractPath(PreexistingContainerName(), - "open-output-stream-uri.txt"); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamUri) { + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("open-output-stream-uri.txt"); ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + path)); } -TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { +TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { + auto data = SetUpPreexistingData(); // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; std::vector lines(kLineCount); int lineno = 0; - std::generate_n(lines.begin(), lines.size(), - [&] { return RandomLine(++lineno, kLineWidth); }); + std::generate_n(lines.begin(), lines.size(), [&] { + return PreexistingData::RandomLine(++lineno, kLineWidth, rng_); + }); - const auto path_to_file = "OpenInputFileMixedReadVsReadAt/object-name"; - const auto path = PreexistingContainerPath() + path_to_file; + const auto path = data.ContainerPath("OpenInputFileMixedReadVsReadAt/object-name"); - UploadLines(lines, path_to_file, kLineCount * kLineWidth); + UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); @@ -1368,7 +1500,7 @@ TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { } // Verify random reads interleave too. - auto const index = RandomIndex(kLineCount); + auto const index = PreexistingData::RandomIndex(kLineCount, rng_); auto const position = index * kLineWidth; ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); EXPECT_EQ(size, kLineWidth); @@ -1381,27 +1513,28 @@ TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { } } -TEST_F(AzuriteFileSystemTest, OpenInputFileRandomSeek) { +TEST_F(TestAzuriteFileSystem, OpenInputFileRandomSeek) { + auto data = SetUpPreexistingData(); // Create a file large enough to make the random access tests non-trivial. 
auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; std::vector lines(kLineCount); int lineno = 0; - std::generate_n(lines.begin(), lines.size(), - [&] { return RandomLine(++lineno, kLineWidth); }); + std::generate_n(lines.begin(), lines.size(), [&] { + return PreexistingData::RandomLine(++lineno, kLineWidth, rng_); + }); - const auto path_to_file = "OpenInputFileRandomSeek/object-name"; - const auto path = PreexistingContainerPath() + path_to_file; + const auto path = data.ContainerPath("OpenInputFileRandomSeek/object-name"); std::shared_ptr output; - UploadLines(lines, path_to_file, kLineCount * kLineWidth); + UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. - auto const index = RandomIndex(kLineCount); + auto const index = PreexistingData::RandomIndex(kLineCount, rng_); auto const position = index * kLineWidth; ASSERT_OK(file->Seek(position)); ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); @@ -1409,15 +1542,15 @@ TEST_F(AzuriteFileSystemTest, OpenInputFileRandomSeek) { } } -TEST_F(AzuriteFileSystemTest, OpenInputFileIoContext) { +TEST_F(TestAzuriteFileSystem, OpenInputFileIoContext) { + auto data = SetUpPreexistingData(); // Create a test file. - const auto path_to_file = "OpenInputFileIoContext/object-name"; - const auto path = PreexistingContainerPath() + path_to_file; + const auto blob_path = "OpenInputFileIoContext/object-name"; + const auto path = data.ContainerPath(blob_path); const std::string contents = "The quick brown fox jumps over the lazy dog"; - auto blob_client = - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = blob_service_client_->GetBlobContainerClient(data.container_name) + .GetBlockBlobClient(blob_path); blob_client.UploadFrom(reinterpret_cast(contents.data()), contents.length()); @@ -1426,8 +1559,9 @@ TEST_F(AzuriteFileSystemTest, OpenInputFileIoContext) { EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); } -TEST_F(AzuriteFileSystemTest, OpenInputFileInfo) { - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { + auto data = SetUpPreexistingData(); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); @@ -1437,24 +1571,27 @@ TEST_F(AzuriteFileSystemTest, OpenInputFileInfo) { auto constexpr kStart = 16; ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); - auto const expected = std::string(kLoremIpsum).substr(kStart); + auto const expected = std::string(PreexistingData::kLoremIpsum).substr(kStart); EXPECT_EQ(std::string(buffer.data(), size), expected); } -TEST_F(AzuriteFileSystemTest, OpenInputFileNotFound) { - ASSERT_RAISES(IOError, fs_->OpenInputFile(NotFoundObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputFileNotFound) { + auto data = SetUpPreexistingData(); + ASSERT_RAISES(IOError, fs_->OpenInputFile(data.NotFoundObjectPath())); } -TEST_F(AzuriteFileSystemTest, OpenInputFileInfoInvalid) { - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); +TEST_F(TestAzuriteFileSystem, OpenInputFileInfoInvalid) { + auto data = SetUpPreexistingData(); + ASSERT_OK_AND_ASSIGN(auto info, 
fs_->GetFileInfo(data.container_name)); ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); } -TEST_F(AzuriteFileSystemTest, OpenInputFileClosed) { - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(PreexistingObjectPath())); +TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { + auto data = SetUpPreexistingData(); + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Tell()); From 431c4ea4d9facb23c612631317a2e1f862087ba7 Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Thu, 14 Dec 2023 12:08:55 -0800 Subject: [PATCH 048/570] GH-39223: [C#] Support IReadOnlyList on remaining scalar types (#39224) ### What changes are included in this PR? Decimal128Array implements IReadOnlyList and IReadOnlyList. Decimal256Array implements IReadOnlyList, IReadOnlyList and IReadOnlyList. FixedLengthBinaryArray implements IReadOnlyList. DurationArray implements IReadOnlyList. Also removes #ifs which are no longer relevant now that netstandard13 isn't being built any more. ### Are these changes tested? Yes. * Closes: #39223 Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow/Arrays/Decimal128Array.cs | 23 ++++++---- .../Apache.Arrow/Arrays/Decimal256Array.cs | 44 ++++++++++++++++--- .../src/Apache.Arrow/Arrays/DurationArray.cs | 17 ++++++- .../Arrays/FixedSizeBinaryArray.cs | 17 ++++++- csharp/src/Apache.Arrow/DecimalUtility.cs | 6 --- .../Decimal128ArrayTests.cs | 25 +++-------- .../Decimal256ArrayTests.cs | 36 +++++++-------- .../Apache.Arrow.Tests/DecimalUtilityTests.cs | 5 --- .../Apache.Arrow.Tests/DurationArrayTests.cs | 4 ++ 9 files changed, 112 insertions(+), 65 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs index 0e3ec56740449..5a51175b7c4da 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs @@ -14,18 +14,16 @@ // limitations under the License. using System; +using System.Collections; using System.Collections.Generic; -#if !NETSTANDARD1_3 using System.Data.SqlTypes; -#endif using System.Diagnostics; -using System.Numerics; using Apache.Arrow.Arrays; using Apache.Arrow.Types; namespace Apache.Arrow { - public class Decimal128Array : FixedSizeBinaryArray + public class Decimal128Array : FixedSizeBinaryArray, IReadOnlyList { public class Builder : BuilderBase { @@ -95,7 +93,6 @@ public Builder AppendRange(IEnumerable values) return Instance; } -#if !NETSTANDARD1_3 public Builder Append(SqlDecimal value) { Span bytes = stackalloc byte[DataType.ByteWidth]; @@ -118,7 +115,6 @@ public Builder AppendRange(IEnumerable values) return Instance; } -#endif public Builder Set(int index, decimal value) { @@ -184,7 +180,6 @@ public string GetString(int index) return DecimalUtility.GetString(ValueBuffer, index, Precision, Scale, ByteWidth); } -#if !NETSTANDARD1_3 public SqlDecimal? GetSqlDecimal(int index) { if (IsNull(index)) @@ -194,6 +189,18 @@ public string GetString(int index) return DecimalUtility.GetSqlDecimal128(ValueBuffer, index, Precision, Scale); } -#endif + + int IReadOnlyCollection.Count => Length; + SqlDecimal? 
IReadOnlyList.this[int index] => GetSqlDecimal(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetSqlDecimal(index); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs index 94a47f258280e..eca2611b6f3bb 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs @@ -14,17 +14,16 @@ // limitations under the License. using System; +using System.Collections; using System.Collections.Generic; -#if !NETSTANDARD1_3 using System.Data.SqlTypes; -#endif using System.Diagnostics; using Apache.Arrow.Arrays; using Apache.Arrow.Types; namespace Apache.Arrow { - public class Decimal256Array : FixedSizeBinaryArray + public class Decimal256Array : FixedSizeBinaryArray, IReadOnlyList, IReadOnlyList { public class Builder : BuilderBase { @@ -94,7 +93,6 @@ public Builder AppendRange(IEnumerable values) return Instance; } -#if !NETSTANDARD1_3 public Builder Append(SqlDecimal value) { Span bytes = stackalloc byte[DataType.ByteWidth]; @@ -123,7 +121,6 @@ public Builder AppendRange(IEnumerable values) return Instance; } -#endif public Builder Set(int index, decimal value) { @@ -190,7 +187,6 @@ public string GetString(int index) return DecimalUtility.GetString(ValueBuffer, index, Precision, Scale, ByteWidth); } -#if !NETSTANDARD1_3 public bool TryGetSqlDecimal(int index, out SqlDecimal? value) { if (IsNull(index)) @@ -211,6 +207,40 @@ public bool TryGetSqlDecimal(int index, out SqlDecimal? value) value = null; return false; } -#endif + + private SqlDecimal? GetSqlDecimal(int index) + { + SqlDecimal? value; + if (TryGetSqlDecimal(index, out value)) + { + return value; + } + + throw new OverflowException("decimal256 value out of range of SqlDecimal"); + } + + int IReadOnlyCollection.Count => Length; + SqlDecimal? IReadOnlyList.this[int index] => GetSqlDecimal(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetSqlDecimal(index); + } + } + + int IReadOnlyCollection.Count => Length; + string? IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/DurationArray.cs b/csharp/src/Apache.Arrow/Arrays/DurationArray.cs index 3649dda50cd97..f725a71e377ab 100644 --- a/csharp/src/Apache.Arrow/Arrays/DurationArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/DurationArray.cs @@ -14,11 +14,13 @@ // limitations under the License. using System; +using System.Collections; +using System.Collections.Generic; using Apache.Arrow.Types; namespace Apache.Arrow { - public class DurationArray : PrimitiveArray + public class DurationArray : PrimitiveArray, IReadOnlyList { public class Builder : PrimitiveArrayBuilder { @@ -80,5 +82,18 @@ public DurationArray(ArrayData data) } public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + int IReadOnlyCollection.Count => Length; + TimeSpan? 
IReadOnlyList.this[int index] => GetTimeSpan(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTimeSpan(index); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs index 866a674bc9df8..0fa7954724f38 100644 --- a/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs @@ -14,13 +14,14 @@ // limitations under the License. using System; +using System.Collections; using System.Collections.Generic; using Apache.Arrow.Memory; using Apache.Arrow.Types; namespace Apache.Arrow.Arrays { - public class FixedSizeBinaryArray : Array + public class FixedSizeBinaryArray : Array, IReadOnlyList { public FixedSizeBinaryArray(ArrayData data) : base(data) @@ -70,6 +71,19 @@ public ReadOnlySpan GetBytes(int index) return ValueBuffer.Span.Slice(index * size, size); } + int IReadOnlyCollection.Count => Length; + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + public abstract class BuilderBase : IArrowArrayBuilder where TArray : IArrowArray where TBuilder : class, IArrowArrayBuilder @@ -220,7 +234,6 @@ public TBuilder SetNull(int index) ValidityBuffer.Set(index, false); return Instance; } - } } } diff --git a/csharp/src/Apache.Arrow/DecimalUtility.cs b/csharp/src/Apache.Arrow/DecimalUtility.cs index bb3f0834fcec3..e2ab18d479edb 100644 --- a/csharp/src/Apache.Arrow/DecimalUtility.cs +++ b/csharp/src/Apache.Arrow/DecimalUtility.cs @@ -14,9 +14,7 @@ // limitations under the License. 
using System; -#if !NETSTANDARD1_3 using System.Data.SqlTypes; -#endif using System.Numerics; namespace Apache.Arrow @@ -183,7 +181,6 @@ internal unsafe static string GetString(in ArrowBuffer valueBuffer, int index, i } #endif -#if !NETSTANDARD1_3 internal static SqlDecimal GetSqlDecimal128(in ArrowBuffer valueBuffer, int index, int precision, int scale) { const int byteWidth = 16; @@ -207,7 +204,6 @@ internal static SqlDecimal GetSqlDecimal128(in ArrowBuffer valueBuffer, int inde return new SqlDecimal((byte)precision, (byte)scale, false, (int)(data1 & 0xffffffff), (int)(data1 >> 32), (int)(data2 & 0xffffffff), (int)(data2 >> 32)); } } -#endif private static decimal DivideByScale(BigInteger integerValue, int scale) { @@ -428,7 +424,6 @@ internal static void GetBytes(string value, int precision, int scale, int byteWi } } -#if !NETSTANDARD1_3 internal static void GetBytes(SqlDecimal value, int precision, int scale, Span bytes) { if (value.Precision != precision || value.Scale != scale) @@ -446,6 +441,5 @@ internal static void GetBytes(SqlDecimal value, int precision, int scale, Span asList = array; + for (int i = 0; i < asList.Count; i++) + { + Assert.Equal(testData[i], asList[i]); + } } [Fact] @@ -467,7 +457,6 @@ public void AppendRangeSqlDecimal() Assert.Null(array.GetValue(range.Length)); } } -#endif } } } diff --git a/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs index 3924c73a4e2f7..baeb7ee5419b9 100644 --- a/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs @@ -14,10 +14,9 @@ // limitations under the License. using System; -#if !NETSTANDARD1_3 +using System.Collections.Generic; using System.Data.SqlTypes; using System.Linq; -#endif using Apache.Arrow.Types; using Xunit; @@ -25,7 +24,6 @@ namespace Apache.Arrow.Tests { public class Decimal256ArrayTests { -#if !NETSTANDARD1_3 static SqlDecimal? GetSqlDecimal(Decimal256Array array, int index) { SqlDecimal? result; @@ -42,7 +40,11 @@ public class Decimal256ArrayTests { return value == null ? null : value.Value.Value; } -#endif + + static decimal? Convert(string value) + { + return value == null ? 
null : decimal.Parse(value); + } public class Builder { @@ -68,11 +70,9 @@ public void AppendThenGetGivesNull() Assert.Null(array.GetValue(1)); Assert.Null(array.GetValue(2)); -#if !NETSTANDARD1_3 Assert.Null(GetSqlDecimal(array, 0)); Assert.Null(GetSqlDecimal(array, 1)); Assert.Null(GetSqlDecimal(array, 2)); -#endif } } @@ -106,9 +106,7 @@ public void AppendDecimal(int count) for (int i = 0; i < count; i++) { Assert.Equal(testData[i], array.GetValue(i)); -#if !NETSTANDARD1_3 Assert.Equal(Convert(testData[i]), GetSqlDecimal(array, i)); -#endif } } @@ -127,10 +125,8 @@ public void AppendLargeDecimal() Assert.Equal(large, array.GetValue(0)); Assert.Equal(-large, array.GetValue(1)); -#if !NETSTANDARD1_3 Assert.Equal(Convert(large), GetSqlDecimal(array, 0)); Assert.Equal(Convert(-large), GetSqlDecimal(array, 1)); -#endif } [Fact] @@ -152,12 +148,10 @@ public void AppendMaxAndMinDecimal() Assert.Equal(Decimal.MaxValue - 10, array.GetValue(2)); Assert.Equal(Decimal.MinValue + 10, array.GetValue(3)); -#if !NETSTANDARD1_3 Assert.Equal(Convert(Decimal.MaxValue), GetSqlDecimal(array, 0)); Assert.Equal(Convert(Decimal.MinValue), GetSqlDecimal(array, 1)); Assert.Equal(Convert(Decimal.MaxValue) - 10, GetSqlDecimal(array, 2)); Assert.Equal(Convert(Decimal.MinValue) + 10, GetSqlDecimal(array, 3)); -#endif } [Fact] @@ -175,10 +169,8 @@ public void AppendFractionalDecimal() Assert.Equal(fraction, array.GetValue(0)); Assert.Equal(-fraction, array.GetValue(1)); -#if !NETSTANDARD1_3 Assert.Equal(Convert(fraction), GetSqlDecimal(array, 0)); Assert.Equal(Convert(-fraction), GetSqlDecimal(array, 1)); -#endif } [Fact] @@ -197,9 +189,7 @@ public void AppendRangeDecimal() for(int i = 0; i < range.Length; i ++) { Assert.Equal(range[i], array.GetValue(i)); -#if !NETSTANDARD1_3 Assert.Equal(Convert(range[i]), GetSqlDecimal(array, i)); -#endif } Assert.Null( array.GetValue(range.Length)); @@ -308,7 +298,6 @@ public void SwapNull() } } -#if !NETSTANDARD1_3 public class SqlDecimals { [Theory] @@ -342,6 +331,18 @@ public void AppendSqlDecimal(int count) Assert.Equal(testData[i], GetSqlDecimal(array, i)); Assert.Equal(Convert(testData[i]), array.GetValue(i)); } + + IReadOnlyList asDecimalList = array; + for (int i = 0; i < asDecimalList.Count; i++) + { + Assert.Equal(testData[i], asDecimalList[i]); + } + + IReadOnlyList asStringList = array; + for (int i = 0; i < asStringList.Count; i++) + { + Assert.Equal(Convert(testData[i]?.ToString()), Convert(asStringList[i])); + } } [Fact] @@ -474,7 +475,6 @@ public void AppendRangeSqlDecimal() Assert.Null(array.GetValue(range.Length)); } } -#endif } } } diff --git a/csharp/test/Apache.Arrow.Tests/DecimalUtilityTests.cs b/csharp/test/Apache.Arrow.Tests/DecimalUtilityTests.cs index 677e9b6cadfcf..1156ecb452c94 100644 --- a/csharp/test/Apache.Arrow.Tests/DecimalUtilityTests.cs +++ b/csharp/test/Apache.Arrow.Tests/DecimalUtilityTests.cs @@ -14,9 +14,7 @@ // limitations under the License. 
using System; -#if !NETSTANDARD1_3 using System.Data.SqlTypes; -#endif using Apache.Arrow.Types; using Xunit; @@ -72,8 +70,6 @@ public void Decimal256HasExpectedResultOrThrows(decimal d, int precision, int sc public class SqlDecimals { - -#if !NETSTANDARD1_3 [Fact] public void NegativeSqlDecimal() { @@ -119,7 +115,6 @@ public void LargeScale() Assert.Equal(negative, sqlNegative); Assert.Equal(digits, sqlNegative.ToString()); } -#endif } public class Strings diff --git a/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs b/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs index 3395ca7bc9ad7..59080d739b10b 100644 --- a/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs @@ -113,6 +113,10 @@ public void AppendTimeSpanGivesSameTimeSpan(TimeSpan? timeSpan, DurationType typ var array = builder.Build(); Assert.Equal(1, array.Length); Assert.Equal(timeSpan, array.GetTimeSpan(0)); + + IReadOnlyList asList = array; + Assert.Equal(1, asList.Count); + Assert.Equal(timeSpan, asList[0]); } } From 75c6b642b5ff1ed171bc1d1a758a70098539c48e Mon Sep 17 00:00:00 2001 From: Miguel Pragier Date: Fri, 15 Dec 2023 20:03:18 +0100 Subject: [PATCH 049/570] GH-39238:[Go] PATCH Prevents empty record to be appended to empty resultset (#39239) ### Rationale for this change When having an empty resultset, the driver tries to include an empty record referece, that cannot be scanned. So, any operation that relies on the returned Row(s) will trigger a "Index out of Range" error. ### What changes are included in this PR? We're preventing to include an invalid record (that can't be scanned) in an empty resultset ### Are these changes tested? Yes, there's a new test included ### Are there any user-facing changes? No **This PR contains a "Critical Fix".** * Closes: #39238 Authored-by: miguel pragier Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/driver/driver.go | 7 +-- .../flight/flightsql/driver/driver_test.go | 44 +++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/go/arrow/flight/flightsql/driver/driver.go b/go/arrow/flight/flightsql/driver/driver.go index e31e572586557..f74bfa378a303 100644 --- a/go/arrow/flight/flightsql/driver/driver.go +++ b/go/arrow/flight/flightsql/driver/driver.go @@ -487,9 +487,10 @@ func readEndpoint(ctx context.Context, client *flightsql.Client, endpoint *fligh schema := reader.Schema() var records []arrow.Record for reader.Next() { - record := reader.Record() - record.Retain() - records = append(records, record) + if record := reader.Record(); record.NumRows() > 0 { + record.Retain() + records = append(records, record) + } } if err := reader.Err(); err != nil && !errors.Is(err, io.EOF) { diff --git a/go/arrow/flight/flightsql/driver/driver_test.go b/go/arrow/flight/flightsql/driver/driver_test.go index a388bf155ec99..24eb5ee6812c0 100644 --- a/go/arrow/flight/flightsql/driver/driver_test.go +++ b/go/arrow/flight/flightsql/driver/driver_test.go @@ -273,6 +273,50 @@ func (s *SqlTestSuite) TestQuery() { wg.Wait() } +func (s *SqlTestSuite) TestQueryWithEmptyResultset() { + t := s.T() + + // Create and start the server + server, addr, err := s.createServer() + require.NoError(t, err) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + require.NoError(s.T(), s.startServer(server)) + }() + defer s.stopServer(server) + time.Sleep(100 * time.Millisecond) + + // Configure client + cfg := s.Config + cfg.Address = addr + db, err := sql.Open("flightsql", cfg.DSN()) + require.NoError(t, 
err) + defer db.Close() + + // Create the table + _, err = db.Exec(fmt.Sprintf(s.Statements["create table"], s.TableName)) + require.NoError(t, err) + + rows, err := db.Query(fmt.Sprintf(s.Statements["query"], s.TableName)) + require.NoError(t, err) + require.False(t, rows.Next()) + + row := db.QueryRow(fmt.Sprintf(s.Statements["query"], s.TableName)) + require.NotNil(t, row) + require.NoError(t, row.Err()) + + target := make(map[string]any) + err = row.Scan(&target) + require.ErrorIs(t, err, sql.ErrNoRows) + + // Tear-down server + s.stopServer(server) + wg.Wait() +} + func (s *SqlTestSuite) TestPreparedQuery() { t := s.T() From 132b1f71ab9a4993557fb79e89824418e5e3618b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 16 Dec 2023 23:52:43 +0900 Subject: [PATCH 050/570] GH-39246: [CI][GLib][Ruby] Use Ubuntu 22.04 not 20.04 (#39247) ### Rationale for this change Ubuntu 20.04 ships Ruby 2.7, which has reached EOL. Bundler 2.5.0 or later requires Ruby 3.0 or later. ### What changes are included in this PR? Use Ubuntu 22.04, which ships Ruby 3.0. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #39246 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/ruby.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 25edec62e06eb..be30865ac7ac6 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -66,7 +66,7 @@ jobs: fail-fast: false matrix: ubuntu: - - 20.04 + - 22.04 env: UBUNTU: ${{ matrix.ubuntu }} steps: From 49fde2313bc429547bc7e13886ed28c9c7fc6a84 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 16 Dec 2023 11:10:04 -0500 Subject: [PATCH 051/570] GH-15060: [JS] Add LargeUtf8 type (#35780) This pull request adds support for the LargeUtf8 type in Arrow. Now we can create, decode, and encode these vectors. However, while the offset vectors support 64-bit integers, the value buffers are still limited to 32-bit lengths, meaning that LargeUtf8 vectors cannot yet be larger than Utf8 vectors. We will see how we can address this limitation in a follow-up pull request. The issue is that JS typed arrays can be at most 2**31-1 elements long (implementation-defined). This pull request also fixes a bug in a rounding method that prevented us from supporting large vectors, so it's already a big step forward. Fixes #15060.
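For illustration only (this snippet is not part of the patch): building and reading a LargeUtf8 vector is expected to mirror the existing Utf8 path. A minimal sketch, assuming the `makeBuilder` factory exported by `apache-arrow` accepts the new `LargeUtf8` type the same way it accepts `Utf8`:

```ts
import { makeBuilder, LargeUtf8 } from 'apache-arrow';

// Build a LargeUtf8 vector the same way a Utf8 vector is built; per this
// change, only the offsets buffer differs (BigInt64Array instead of Int32Array).
const builder = makeBuilder({ type: new LargeUtf8(), nullValues: [null] });
for (const value of ['foo', null, 'bar']) {
    builder.append(value);
}
const vector = builder.finish().toVector();

console.log(vector.get(0));    // 'foo'
console.log(vector.nullCount); // 1
// Expected to be a BigInt64Array, since LargeUtf8 uses 64-bit value offsets.
console.log(vector.data[0].valueOffsets.constructor.name);
```

Because the value buffer is still indexed with 32-bit lengths (see above), this behaves like the Utf8 builder in practice for now; the 64-bit offsets mainly prepare the format for larger data later.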
* Closes: #15060 --------- Co-authored-by: Kyle Barron --- docs/source/status.rst | 2 +- js/src/Arrow.dom.ts | 4 +- js/src/Arrow.ts | 3 +- js/src/builder.ts | 27 +++++----- js/src/builder/buffer.ts | 52 ++++++++---------- js/src/builder/largeutf8.ts | 59 ++++++++++++++++++++ js/src/builder/list.ts | 4 +- js/src/data.ts | 18 +++++-- js/src/enum.ts | 6 +-- js/src/interfaces.ts | 22 +++++--- js/src/ipc/metadata/json.ts | 3 +- js/src/ipc/metadata/message.ts | 3 +- js/src/type.ts | 35 +++++++++--- js/src/util/buffer.ts | 10 ++-- js/src/visitor.ts | 4 ++ js/src/visitor/builderctor.ts | 2 + js/src/visitor/bytelength.ts | 3 +- js/src/visitor/get.ts | 19 ++++++- js/src/visitor/indexof.ts | 4 +- js/src/visitor/iterator.ts | 4 +- js/src/visitor/jsontypeassembler.ts | 5 +- js/src/visitor/jsonvectorassembler.ts | 8 ++- js/src/visitor/set.ts | 23 ++++++-- js/src/visitor/typeassembler.ts | 5 ++ js/src/visitor/typecomparator.ts | 4 +- js/src/visitor/typector.ts | 1 + js/src/visitor/vectorassembler.ts | 26 ++++++++- js/src/visitor/vectorloader.ts | 7 ++- js/test/data/tables.ts | 2 +- js/test/generate-test-data.ts | 60 +++++++++++++++++---- js/test/unit/builders/builder-tests.ts | 1 + js/test/unit/builders/largeUtf8-tests.ts | 65 +++++++++++++++++++++++ js/test/unit/generated-data-tests.ts | 1 + js/test/unit/generated-data-validators.ts | 20 +++++-- js/test/unit/vector/vector-tests.ts | 24 ++++++++- js/test/unit/visitor-tests.ts | 6 ++- 36 files changed, 432 insertions(+), 110 deletions(-) create mode 100644 js/src/builder/largeutf8.ts create mode 100644 js/test/unit/builders/largeUtf8-tests.ts diff --git a/docs/source/status.rst b/docs/source/status.rst index b8ee7eedbf284..e52e4e4cd49bc 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -66,7 +66,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Large Utf8 | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Binary View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts index 451bf6acb6186..9ec76fdd009f3 100644 --- a/js/src/Arrow.dom.ts +++ b/js/src/Arrow.dom.ts @@ -47,7 +47,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -96,5 +96,5 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, + Utf8Builder, LargeUtf8Builder } from './Arrow.js'; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 714861e764ccb..b7e5f63a6ab5a 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -36,7 +36,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -78,6 +78,7 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, 
IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; export { ListBuilder } from './builder/list.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; diff --git a/js/src/builder.ts b/js/src/builder.ts index 93510eedf84ff..1a4c52f871bbf 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, Binary, List, Map_, + Utf8, LargeUtf8, Binary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -198,10 +198,10 @@ export abstract class Builder { return this.children.reduce((size, child) => size + child.reservedByteLength, size); } - declare protected _offsets: DataBufferBuilder; + declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } - declare protected _values: BufferBuilder; + declare protected _values: BufferBuilder; public get values() { return this._values ? this._values.buffer : null; } declare protected _nulls: BitmapBufferBuilder; @@ -277,18 +277,15 @@ export abstract class Builder { * @returns A `Data` of the buffers and children representing the values written. */ public flush(): Data { - - let data; - let typeIds; - let nullBitmap; - let valueOffsets; + let data: BufferBuilder | undefined; + let typeIds: Int8Array; + let nullBitmap: Uint8Array | undefined; + let valueOffsets: T['TOffsetArray']; const { type, length, nullCount, _typeIds, _offsets, _values, _nulls } = this; - if (typeIds = _typeIds?.flush(length)) { // Unions - // DenseUnions + if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions valueOffsets = _offsets?.flush(length); - } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists - // Binary, Utf8 + } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists data = _values?.flush(_offsets.last()); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval) data = _values?.flush(length); @@ -355,13 +352,13 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; constructor(opts: BuilderOptions) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public setValue(index: number, value: T['TValue']) { const pending = this._pending || (this._pending = new Map()); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 03d4f33349a7a..402172059682c 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -16,32 +16,21 @@ // under the License. 
import { memcpy } from '../util/buffer.js'; -import { - TypedArray, TypedArrayConstructor, - BigIntArray, BigIntArrayConstructor -} from '../interfaces.js'; - -/** @ignore */ type DataValue = T extends TypedArray ? number : T extends BigIntArray ? WideValue : T; -/** @ignore */ type WideValue = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never; -/** @ignore */ type ArrayCtor = - T extends TypedArray ? TypedArrayConstructor : - T extends BigIntArray ? BigIntArrayConstructor : - any; +import { TypedArray, BigIntArray, ArrayCtor } from '../interfaces.js'; +import { DataType } from '../type.js'; /** @ignore */ -const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; +function roundLengthUpToNearest64Bytes(len: number, BPE: number) { + const bytesMinus1 = Math.ceil(len) * BPE - 1; + return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; +} /** @ignore */ const sliceOrExtendArray = (arr: T, len = 0) => ( arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) ) as T; /** @ignore */ -export interface BufferBuilder> { - readonly offset: number; -} - -/** @ignore */ -export class BufferBuilder> { +export class BufferBuilder { constructor(buffer: T, stride = 1) { this.buffer = buffer; @@ -64,8 +53,8 @@ export class BufferBuilder 0) { this.length += extra; @@ -97,13 +86,11 @@ export class BufferBuilder extends BufferBuilder { +export class DataBufferBuilder extends BufferBuilder { public last() { return this.get(this.length - 1); } - public get(index: number) { return this.buffer[index]; } - public set(index: number, value: number) { + public get(index: number): T[0] { return this.buffer[index]; } + public set(index: number, value: T[0]) { this.reserve(index - this.length + 1); this.buffer[index * this.stride] = value; return this; @@ -134,15 +121,18 @@ export class BitmapBufferBuilder extends DataBufferBuilder { } /** @ignore */ -export class OffsetsBufferBuilder extends DataBufferBuilder { - constructor(data = new Int32Array(1)) { super(data, 1); } - public append(value: number) { +export class OffsetsBufferBuilder extends DataBufferBuilder { + constructor(type: T) { + super(new type.OffsetArrayType(1), 1); + } + + public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } - public set(index: number, value: number) { + public set(index: number, value: T['TOffsetArray'][0]) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; - if (offset < index++) { + if (offset < index++ && offset >= 0) { buffer.fill(buffer[offset], offset, index); } buffer[index] = buffer[index - 1] + value; @@ -150,7 +140,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder { } public flush(length = this.length - 1) { if (length > this.length) { - this.set(length - 1, 0); + this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? BigInt(0) : 0); } return super.flush(length + 1); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts new file mode 100644 index 0000000000000..fddfeaf8e7b17 --- /dev/null +++ b/js/src/builder/largeutf8.ts @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { LargeUtf8 } from '../type.js'; +import { encodeUtf8 } from '../util/utf8.js'; +import { BufferBuilder } from './buffer.js'; +import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; + +/** @ignore */ +export class LargeUtf8Builder extends VariableWidthBuilder { + constructor(opts: BuilderOptions) { + super(opts); + this._values = new BufferBuilder(new Uint8Array(0)); + } + public get byteLength(): number { + let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: string) { + return super.setValue(index, encodeUtf8(value) as any); + } + // @ts-ignore + // TODO: move to largeBinaryBuilder when implemented + // protected _flushPending(pending: Map, pendingLength: number): void { } + protected _flushPending(pending: Map, pendingLength: number) { + const offsets = this._offsets; + const data = this._values.reserve(pendingLength).buffer; + let offset = 0; + for (const [index, value] of pending) { + if (value === undefined) { + offsets.set(index, BigInt(0)); + } else { + const length = value.length; + data.set(value, offset); + offsets.set(index, BigInt(length)); + offset += length; + } + } + } +} + +// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending; diff --git a/js/src/builder/list.ts b/js/src/builder/list.ts index d83cac8e7b1c6..b2739cd5a3260 100644 --- a/js/src/builder/list.ts +++ b/js/src/builder/list.ts @@ -22,10 +22,10 @@ import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js'; /** @ignore */ export class ListBuilder extends VariableWidthBuilder, TNull> { - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder>; constructor(opts: BuilderOptions, TNull>) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public addChild(child: Builder, name = '0') { if (this.numChildren > 0) { diff --git a/js/src/data.ts b/js/src/data.ts index 1e9df71cff8a7..145ee9d049cb4 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -17,7 +17,7 @@ import { Vector } from './vector.js'; import { BufferType, Type, UnionMode } from './enum.js'; -import { DataType, strideForType } from './type.js'; +import { DataType, LargeUtf8, strideForType } from './type.js'; import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; // When slicing, we do not know the null count of the sliced range without @@ -30,11 +30,12 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; /** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; 
+/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ export interface Buffers { - [BufferType.OFFSET]: Int32Array; + [BufferType.OFFSET]: T['TOffsetArray']; [BufferType.DATA]: T['TArray']; [BufferType.VALIDITY]: Uint8Array; [BufferType.TYPE]: T['TArray']; @@ -264,7 +265,7 @@ import { } from './type.js'; import { Visitor } from './visitor.js'; -import { toArrayBufferView, toInt32Array, toUint8Array } from './util/buffer.js'; +import { toArrayBufferView, toBigInt64Array, toInt32Array, toUint8Array } from './util/buffer.js'; class MakeDataVisitor extends Visitor { public visit(props: any): Data { @@ -307,6 +308,14 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitLargeUtf8(props: LargeUtf8DataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const data = toUint8Array(props['data']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); + } public visitBinary(props: BinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -436,6 +445,7 @@ interface DurationDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } @@ -459,6 +469,7 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : + T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends List /* */ ? ListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : @@ -485,6 +496,7 @@ export function makeData(props: DurationDataProps): Data< export function makeData(props: FixedSizeBinaryDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; +export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: ListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; diff --git a/js/src/enum.ts b/js/src/enum.ts index 2a82dd4235c51..764ea64e63338 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -137,8 +137,7 @@ export enum MessageHeader { * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64). 
* - * **Note**: Only enum values 0-18 (NONE through Duration) are written to an Arrow - * IPC payload. + * **Note**: Only non-negative enum values are written to an Arrow IPC payload. * * The rest of the values are specified here so TypeScript can narrow the type * signatures further beyond the base Arrow Types. The Arrow DataTypes include @@ -175,6 +174,7 @@ export enum Type { FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */ + LargeUtf8 = 20, /** Large variable-length string as List */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -205,7 +205,7 @@ export enum Type { DurationSecond = -27, DurationMillisecond = -28, DurationMicrosecond = -29, - DurationNanosecond = -30 + DurationNanosecond = -30, } export enum BufferType { diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 95c5adbb2a25e..707d01bb14cca 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -33,6 +33,7 @@ import type { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuil import type { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; import type { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; import type { Utf8Builder } from './builder/utf8.js'; +import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { ListBuilder } from './builder/list.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; @@ -98,6 +99,12 @@ export interface BigIntArrayConstructor { from(arrayLike: ArrayLike, mapfn: (v: U, k: number) => bigint, thisArg?: any): T; } +/** @ignore */ +export type ArrayCtor = + T extends TypedArray ? TypedArrayConstructor : + T extends BigIntArray ? BigIntArrayConstructor : + any; + /** @ignore */ export type BuilderCtorArgs< T extends BuilderType, @@ -105,7 +112,7 @@ export type BuilderCtorArgs< TArgs extends any[] = any[], TCtor extends new (type: R, ...args: TArgs) => T = new (type: R, ...args: TArgs) => T - > = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; +> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; /** * Obtain the constructor function of an instance type @@ -115,7 +122,7 @@ export type ConstructorType< T, TCtor extends new (...args: any[]) => T = new (...args: any[]) => T - > = TCtor extends new (...args: any[]) => T ? TCtor : never; +> = TCtor extends new (...args: any[]) => T ? TCtor : never; /** @ignore */ export type BuilderCtorType< @@ -123,7 +130,7 @@ export type BuilderCtorType< R extends DataType = any, TCtor extends new (options: BuilderOptions) => T = new (options: BuilderOptions) => T - > = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; +> = TCtor extends new (options: BuilderOptions) => T ? 
TCtor : never; /** @ignore */ export type BuilderType = @@ -201,6 +208,7 @@ export type TypeToDataType = { [Type.Float64]: type.Float64; [Type.Float]: type.Float; [Type.Utf8]: type.Utf8; + [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; @@ -254,6 +262,7 @@ type TypeToBuilder = { [Type.Float64]: Float64Builder; [Type.Float]: FloatBuilder; [Type.Utf8]: Utf8Builder; + [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; @@ -307,6 +316,7 @@ type DataTypeToBuilder = { [Type.Float64]: T extends type.Float64 ? Float64Builder : never; [Type.Float]: T extends type.Float ? FloatBuilder : never; [Type.Utf8]: T extends type.Utf8 ? Utf8Builder : never; + [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; @@ -329,11 +339,11 @@ type DataTypeToBuilder = { [Type.Interval]: T extends type.Interval ? IntervalBuilder : never; [Type.IntervalDayTime]: T extends type.IntervalDayTime ? IntervalDayTimeBuilder : never; [Type.IntervalYearMonth]: T extends type.IntervalYearMonth ? IntervalYearMonthBuilder : never; - [Type.Duration]: T extends type.Duration ? DurationBuilder: never; + [Type.Duration]: T extends type.Duration ? DurationBuilder : never; [Type.DurationSecond]: T extends type.DurationSecond ? DurationSecondBuilder : never; [Type.DurationMillisecond]: T extends type.DurationMillisecond ? DurationMillisecondBuilder : never; - [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder: never; - [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder: never; + [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder : never; + [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; [Type.Struct]: T extends type.Struct ? 
StructBuilder : never; diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index f1f306730ddba..b669c0c612f8a 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -20,7 +20,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -150,6 +150,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'utf8': return new Utf8(); + case 'largeutf8': return new LargeUtf8(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index 27c9b92d6897b..cf05bff54cfba 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -56,7 +56,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -433,6 +433,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['Utf8']: return new Utf8(); + case Type['LargeUtf8']: return new LargeUtf8(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); case Type['Struct_']: return new Struct(children || []); diff --git a/js/src/type.ts b/js/src/type.ts index 34bbf45bca728..6223d0316f17a 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -19,7 +19,7 @@ import { Field } from './schema.js'; import { Vector } from './vector.js'; import { MapRow } from './row/map.js'; import { StructRow, StructRowProxy } from './row/struct.js'; -import { TypedArrayConstructor } from './interfaces.js'; +import { ArrayCtor, BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js'; import { bigIntToNumber } from './util/bigint.js'; import { @@ -38,9 +38,11 @@ export type IsSigned = { 'true': true; 'false': false }; export interface DataType { readonly TType: TType; readonly TArray: any; + readonly TOffsetArray: any; readonly TValue: any; readonly TChildren: TChildren; readonly ArrayType: any; + readonly OffsetArrayType: ArrayCtor; readonly children: Field[]; } @@ -57,6 +59,7 @@ export abstract class DataType { (proto).children = null; (proto).ArrayType = Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'DataType'; })(DataType.prototype); } @@ -232,7 +236,7 @@ Object.defineProperty(Float32.prototype, 'ArrayType', { value: Float32Array }); Object.defineProperty(Float64.prototype, 'ArrayType', { value: Float64Array }); /** @ignore */ -export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor } +export interface Binary extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Binary extends DataType { constructor() { @@ -247,7 
+251,7 @@ export class Binary extends DataType { } /** @ignore */ -export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; ArrayType: TypedArrayConstructor } +export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Utf8 extends DataType { constructor() { @@ -261,6 +265,22 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } +/** @ignore */ +export class LargeUtf8 extends DataType { + constructor() { + super(); + } + public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } + public toString() { return `LargeUtf8`; } + protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { + (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'LargeUtf8'; + })(LargeUtf8.prototype); +} + /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -458,13 +478,13 @@ export class Duration extends DataType { } /** @ignore */ -export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); }} +export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); } } /** @ignore */ -export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); }} +export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); } } /** @ignore */ -export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); }} +export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); } } /** @ignore */ -export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); }} +export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); } } /** @ignore */ @@ -581,6 +601,7 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index dd8edf11f9258..4f4379dedf6d8 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -83,9 +83,9 @@ export function joinUint8Arrays(chunks: Uint8Array[], size?: number | null): [Ui } /** @ignore */ -export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | ArrayLike | ByteBuffer | string | null | undefined | - IteratorResult | ArrayLike | ByteBuffer | string | null | undefined> | - ReadableStreamReadResult | ArrayLike | ByteBuffer | string | null | undefined>; +export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined | + IteratorResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined> | + ReadableStreamReadResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined>; /** @ignore */ export function toArrayBufferView< @@ -208,7 
+208,9 @@ export async function* toArrayBufferViewAsyncIterator(Arra /** @ignore */ export const toUint8ClampedArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint8ClampedArray, input); /** @ignore */ -export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array) { +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array): Int32Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: BigInt64Array): BigInt64Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: any) { // If we have a non-zero offset, create a new offsets array with the values // shifted by the start offset, such that the new start offset is 0 if (offset !== 0) { diff --git a/js/src/visitor.ts b/js/src/visitor.ts index c63640b038e47..5b3cc4d3d0593 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -36,6 +36,7 @@ export abstract class Visitor { public visitInt(_node: any, ..._args: any[]): any { return null; } public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } + public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } @@ -89,6 +90,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; + case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; @@ -152,6 +154,7 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.Utf8: return Type.Utf8; + case Type.LargeUtf8: return Type.LargeUtf8; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -229,6 +232,7 @@ export interface Visitor { visitFloat32?(node: any, ...args: any[]): any; visitFloat64?(node: any, ...args: any[]): any; visitUtf8(node: any, ...args: any[]): any; + visitLargeUtf8(node: any, ...args: any[]): any; visitBinary(node: any, ...args: any[]): any; visitFixedSizeBinary(node: any, ...args: any[]): any; visitDate(node: any, ...args: any[]): any; diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 54b5610a50eed..83374712b2642 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -40,6 +40,7 @@ import { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from '../builder/time.js'; import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; +import { LargeUtf8Builder } from '../builder/largeutf8.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -67,6 +68,7 @@ export class GetBuilderCtor extends Visitor { public visitFloat32() { return Float32Builder; } public visitFloat64() { return Float64Builder; } public 
visitUtf8() { return Utf8Builder; } + public visitLargeUtf8() { return LargeUtf8Builder; } public visitBinary() { return BinaryBuilder; } public visitFixedSizeBinary() { return FixedSizeBinaryBuilder; } public visitDate() { return DateBuilder; } diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts index 72d6148a52fd8..c3bfadd50e155 100644 --- a/js/src/visitor/bytelength.ts +++ b/js/src/visitor/bytelength.ts @@ -26,7 +26,7 @@ import { Type, TimeUnit, UnionMode } from '../enum.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -40,6 +40,7 @@ export interface GetByteLengthVisitor extends Visitor { getVisitFn(node: T): (data: Data>, index: number) => number; visitBinary(data: Data, index: number): number; visitUtf8(data: Data, index: number): number; + visitLargeUtf8(data: Data, index: number): number; visitList(data: Data, index: number): number; visitDenseUnion(data: Data, index: number): number; visitSparseUnion(data: Data, index: number): number; diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 5aaaedf51a37e..a801c90047c89 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -21,6 +21,7 @@ import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; import { MapRow } from '../row/map.js'; import { StructRow, StructRowProxy } from '../row/struct.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { decodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { uint16ToFloat64 } from '../util/math.js'; @@ -35,7 +36,7 @@ import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, - Union, DenseUnion, SparseUnion, + Union, DenseUnion, SparseUnion, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -60,6 +61,7 @@ export interface GetVisitor extends Visitor { visitFloat32(data: Data, index: number): T['TValue'] | null; visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; + visitLargeUtf8(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; @@ -122,6 +124,15 @@ const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, ind const y = valueOffsets[index + 1]; return values.subarray(x, y); }; +/** @ignore */ +const getLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: BigInt64Array, index: number) => { + if (index + 1 >= valueOffsets.length) { + return null as any; + } + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); + return values.subarray(x, y); +}; /** @ignore */ const getBool = ({ offset, values }: Data, index: number): T['TValue'] => { @@ -155,6 +166,11 @@ const getUtf8 = ({ values, valueOffsets }: Data, index: numbe const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? 
decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getLargeUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { + const bytes = getLargeVariableWidthBytes(values, valueOffsets, index); + return bytes !== null ? decodeUtf8(bytes) : null as any; +}; /* istanbul ignore next */ /** @ignore */ @@ -328,6 +344,7 @@ GetVisitor.prototype.visitFloat16 = wrapGet(getFloat16); GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitLargeUtf8 = wrapGet(getLargeUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 4cf0076b3c8e2..76f95788c7953 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,6 +57,7 @@ export interface IndexOfVisitor extends Visitor { visitFloat32(data: Data, value: T['TValue'] | null, index?: number): number; visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; @@ -172,6 +173,7 @@ IndexOfVisitor.prototype.visitFloat16 = indexOfValue; IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index e38bb907695d0..09dfcb0b565ae 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,6 +55,7 @@ export interface IteratorVisitor extends Visitor { visitFloat32(vector: Vector): IterableIterator; visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; + visitLargeUtf8(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: 
Vector): IterableIterator; @@ -158,6 +159,7 @@ IteratorVisitor.prototype.visitFloat16 = vectorIterator; IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; +IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index 6e6cfb07413c3..a6746a858ecb4 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -48,6 +48,9 @@ export class JSONTypeAssembler extends Visitor { public visitUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeUtf8({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } @@ -64,7 +67,7 @@ export class JSONTypeAssembler extends Visitor { return { 'name': ArrowType[typeId].toLowerCase(), 'unit': IntervalUnit[unit] }; } public visitDuration({ typeId, unit }: T) { - return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit]}; + return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit] }; } public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 0af954e4adacc..9a3cb8601a434 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -27,7 +27,7 @@ import { BitIterator, getBit, getBool } from '../util/bit.js'; import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -42,6 +42,7 @@ export interface JSONVectorAssembler extends Visitor { visitInt(data: Data): { DATA: number[] | string[] }; visitFloat(data: Data): { DATA: number[] }; visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; + visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; @@ -100,6 +101,9 @@ export class JSONVectorAssembler extends Visitor { public visitUtf8(data: Data) { return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; } + public visitLargeUtf8(data: Data) { + return { 'DATA': [...new Vector([data])], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; + } public visitBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], OFFSET: [...data.valueOffsets] }; } @@ -148,7 +152,7 @@ export class JSONVectorAssembler extends Visitor { return { 'DATA': [...data.values] }; } public visitDuration(data: Data) { - return { 'DATA': [...bigNumsToStrings(data.values, 2)]}; + return { 'DATA': [...bigNumsToStrings(data.values, 2)] }; } public visitFixedSizeList(data: Data) { return { diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index 1a0eddc556899..a439ec8311fd6 
100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -19,13 +19,14 @@ import { Data } from '../data.js'; import { Field } from '../schema.js'; import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { encodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -58,6 +59,7 @@ export interface SetVisitor extends Visitor { visitFloat32(data: Data, index: number, value: T['TValue']): void; visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; + visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; @@ -123,9 +125,19 @@ export const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { - const { [index]: x, [index + 1]: y } = valueOffsets; + const x = valueOffsets[index]; + const y = valueOffsets[index + 1]; + values.set(value.subarray(0, y - x), x); + } +}; + +/** @ignore */ +export const setLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { + if (index + 1 < valueOffsets.length) { + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); values.set(value.subarray(0, y - x), x); } }; @@ -167,6 +179,10 @@ const setBinary = ({ values, valueOffsets }: Data, index: n const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); }; +/** @ignore */ +const setLargeUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { + setLargeVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -365,6 +381,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitLargeUtf8 = wrapSet(setLargeUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); diff --git a/js/src/visitor/typeassembler.ts b/js/src/visitor/typeassembler.ts index c2262d20531b9..f072714222739 100644 --- a/js/src/visitor/typeassembler.ts +++ b/js/src/visitor/typeassembler.ts @@ -27,6 +27,7 
@@ import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; import { Time } from '../fb/time.js'; @@ -78,6 +79,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitLargeUtf8(_node: T, b: Builder) { + LargeUtf8.startLargeUtf8(b); + return LargeUtf8.endLargeUtf8(b); + } public visitDecimal(node: T, b: Builder) { Decimal.startDecimal(b); Decimal.addScale(b, node.scale); diff --git a/js/src/visitor/typecomparator.ts b/js/src/visitor/typecomparator.ts index 1de8e218dae4f..2417dec09c6e9 100644 --- a/js/src/visitor/typecomparator.ts +++ b/js/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -53,6 +53,7 @@ export interface TypeComparator extends Visitor { visitFloat32(type: T, other?: DataType | null): other is T; visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; + visitLargeUtf8(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; @@ -249,6 +250,7 @@ TypeComparator.prototype.visitFloat16 = compareFloat; TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; +TypeComparator.prototype.visitLargeUtf8 = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; diff --git a/js/src/visitor/typector.ts b/js/src/visitor/typector.ts index 077f66592fbfb..2e0bbc4147abb 100644 --- a/js/src/visitor/typector.ts +++ b/js/src/visitor/typector.ts @@ -49,6 +49,7 @@ export class GetDataTypeConstructor extends Visitor { public visitFloat32() { return type.Float32; } public visitFloat64() { return type.Float64; } public visitUtf8() { return type.Utf8; } + public visitLargeUtf8() { return type.LargeUtf8; } public visitBinary() { return type.Binary; } public visitFixedSizeBinary() { return type.FixedSizeBinary; } public visitDate() { return type.Date_; } diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index 949463272e718..7a9d3bdd57b0d 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -27,8 +27,9 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, LargeUtf8, } from '../type.js'; +import { bigIntToNumber } from '../util/bigint.js'; /** @ignore */ export 
interface VectorAssembler extends Visitor { @@ -204,9 +205,29 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const { [0]: begin, [length]: end } = valueOffsets; + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { + const { length, values, valueOffsets } = data; + const begin = bigIntToNumber(valueOffsets[0]); + const end = bigIntToNumber(valueOffsets[length]); + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function _assembleFlatListVector( + this: VectorAssembler, + length: number, + begin: number, + end: number, + values: T['TArray'], + valueOffsets: T['TOffsetArray'] +) { const byteLength = Math.min(end - begin, values.byteLength - begin); // Push in the order FlatList types read their buffers - addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets as any)); // valueOffsets buffer first addBuffer.call(this, values.subarray(begin, begin + byteLength)); // sliced values buffer second return this; } @@ -234,6 +255,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitLargeUtf8 = assembleLargeFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index db34edad9a1c1..35f28f49baada 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -71,6 +71,9 @@ export class VectorLoader extends Visitor { public visitUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } @@ -151,7 +154,7 @@ export class JSONVectorLoader extends VectorLoader { return nullCount <= 0 ? 
new Uint8Array(0) : packBools(this.sources[offset]); } protected readOffsets(_type: T, { offset } = this.nextBufferRange()) { - return toArrayBufferView(Uint8Array, toArrayBufferView(Int32Array, this.sources[offset])); + return toArrayBufferView(Uint8Array, toArrayBufferView(_type.OffsetArrayType, this.sources[offset])); } protected readTypeIds(type: T, { offset } = this.nextBufferRange()) { return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, this.sources[offset])); @@ -170,7 +173,7 @@ export class JSONVectorLoader extends VectorLoader { return binaryDataFromJSON(sources[offset] as string[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); - } else if (DataType.isUtf8(type)) { + } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { return encodeUtf8((sources[offset] as string[]).join('')); } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); diff --git a/js/test/data/tables.ts b/js/test/data/tables.ts index 28aed7e4feccf..449cfe1fb853a 100644 --- a/js/test/data/tables.ts +++ b/js/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 15fb715a31f95..9d7b038331fe6 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -24,7 +24,7 @@ import { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -52,6 +52,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; @@ -75,6 +76,7 @@ interface TestDataVectorGenerator extends Visitor { visitUint64: typeof generateBigInt; visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; + visitLargeUtf8: typeof generateLargeUtf8; visitBinary: typeof generateBinary; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; @@ -100,6 +102,7 @@ TestDataVectorGenerator.prototype.visitInt64 = generateBigInt; TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; +TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; 
TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; @@ -214,6 +217,7 @@ export const float16 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float32(), length, nullCount); export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); +export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); @@ -242,7 +246,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -312,7 +316,7 @@ function generateFloat(this: TestDataVectorGenerator, type: T, function generateUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values: string[] = new Array(valueOffsets.length - 1).fill(null); [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? 
o - valueOffsets[i] : null) @@ -332,9 +336,31 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); + const values: string[] = new Array(valueOffsets.length - 1).fill(null); + [...valueOffsets.slice(1)] + .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) + .reduce((map, length, i) => { + if (length !== null) { + if (length > 0) { + do { + values[i] = randomString(Number(length)); + } while (map.has(values[i])); + return map.set(values[i], i); + } + values[i] = ''; + } + return map; + }, new Map()); + const data = createVariableWidthBytes(length, nullBitmap, valueOffsets, (i) => encodeUtf8(values[i])); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; +} + function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values = [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) .map((length) => length == null ? null : randomBytes(length)); @@ -443,7 +469,7 @@ function generateList(this: TestDataVectorGenerator, type: T, le const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues = child.values(); const values: (T['valueType'] | null)[] = [...valueOffsets.slice(1)] @@ -581,7 +607,7 @@ function generateMap(this: TestDataVectorGenerator, const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues: { key: K; value: V }[] = child.values(); const values: (Record | null)[] = [...valueOffsets.slice(1)] @@ -660,7 +686,7 @@ function createBitmap(length: number, nullCount: number) { return bytes; } -function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { +function createVariableWidthOffsets32(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { const offsets = new Int32Array(length + 1); iterateBitmap(length, nullBitmap, (i, valid) => { if (!valid) { @@ -674,10 +700,24 @@ function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min return offsets; } -function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array, getBytes: (index: number) => 
Uint8Array) { - const bytes = new Uint8Array(offsets[length]); +function createVariableWidthOffsets64(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { + const offsets = new BigInt64Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + BigInt(Math.min(max, Math.max(min, Math.trunc(rand() * max)))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; +} + +function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array | BigInt64Array, getBytes: (index: number) => Uint8Array) { + const bytes = new Uint8Array(Number(offsets[length])); iterateBitmap(length, nullBitmap, (i, valid) => { - valid && bytes.set(getBytes(i), offsets[i]); + valid && bytes.set(getBytes(i), Number(offsets[i])); }); return bytes; } diff --git a/js/test/unit/builders/builder-tests.ts b/js/test/unit/builders/builder-tests.ts index b261e4f815e3a..0137c7aa66635 100644 --- a/js/test/unit/builders/builder-tests.ts +++ b/js/test/unit/builders/builder-tests.ts @@ -44,6 +44,7 @@ describe('Generated Test Data', () => { describe('Float32Builder', () => { validateBuilder(generate.float32); }); describe('Float64Builder', () => { validateBuilder(generate.float64); }); describe('Utf8Builder', () => { validateBuilder(generate.utf8); }); + describe('LargeUtf8Builder', () => { validateBuilder(generate.largeUtf8); }); describe('BinaryBuilder', () => { validateBuilder(generate.binary); }); describe('FixedSizeBinaryBuilder', () => { validateBuilder(generate.fixedSizeBinary); }); describe('DateDayBuilder', () => { validateBuilder(generate.dateDay); }); diff --git a/js/test/unit/builders/largeUtf8-tests.ts b/js/test/unit/builders/largeUtf8-tests.ts new file mode 100644 index 0000000000000..c789d5dbb1671 --- /dev/null +++ b/js/test/unit/builders/largeUtf8-tests.ts @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import 'web-streams-polyfill'; + +import { validateVector } from './utils.js'; +import { + encodeAll, + encodeEach, + encodeEachDOM, + encodeEachNode, + stringsNoNulls, + stringsWithNAs, + stringsWithNulls, + stringsWithEmpties +} from './utils.js'; + +import { Vector, LargeUtf8 } from 'apache-arrow'; + +const testDOMStreams = process.env.TEST_DOM_STREAMS === 'true'; +const testNodeStreams = process.env.TEST_NODE_STREAMS === 'true'; + +describe('LargeUtf8Builder', () => { + runTestsWithEncoder('encodeAll', encodeAll(() => new LargeUtf8())); + runTestsWithEncoder('encodeEach: 5', encodeEach(() => new LargeUtf8(), 5)); + runTestsWithEncoder('encodeEach: 25', encodeEach(() => new LargeUtf8(), 25)); + runTestsWithEncoder('encodeEach: undefined', encodeEach(() => new LargeUtf8(), void 0)); + testDOMStreams && runTestsWithEncoder('encodeEachDOM: 25', encodeEachDOM(() => new LargeUtf8(), 25)); + testNodeStreams && runTestsWithEncoder('encodeEachNode: 25', encodeEachNode(() => new LargeUtf8(), 25)); +}); + +function runTestsWithEncoder(name: string, encode: (vals: (string | null)[], nullVals?: any[]) => Promise>) { + describe(`${encode.name} ${name}`, () => { + it(`encodes strings no nulls`, async () => { + const vals = stringsNoNulls(20); + validateVector(vals, await encode(vals, []), []); + }); + it(`encodes strings with nulls`, async () => { + const vals = stringsWithNulls(20); + validateVector(vals, await encode(vals, [null]), [null]); + }); + it(`encodes strings using n/a as the null value rep`, async () => { + const vals = stringsWithNAs(20); + validateVector(vals, await encode(vals, ['n/a']), ['n/a']); + }); + it(`encodes strings using \\0 as the null value rep`, async () => { + const vals = stringsWithEmpties(20); + validateVector(vals, await encode(vals, ['\0']), ['\0']); + }); + }); +} diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts index d64c7c188d3ed..0a06bcbab8ee0 100644 --- a/js/test/unit/generated-data-tests.ts +++ b/js/test/unit/generated-data-tests.ts @@ -38,6 +38,7 @@ describe('Generated Test Data', () => { describe('Float32', () => { validateVector(generate.float32()); }); describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); + describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); diff --git a/js/test/unit/generated-data-validators.ts b/js/test/unit/generated-data-validators.ts index 52f642d2a6e89..57ee94876c300 100644 --- a/js/test/unit/generated-data-validators.ts +++ b/js/test/unit/generated-data-validators.ts @@ -113,7 +113,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); if (keys && keys.length > 0) { test(`dictionary indices should match`, () => { @@ -126,7 +128,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { ? 
expect(indices.get(i)).toBe(keys[i]) : expect(indices.get(i)).toBeNull(); } - } catch (e) { throw new Error(`${indices}[${i}]: ${e}`); } + } catch (e) { + throw new Error(`${indices}[${i}]: ${e}`); + } }); } test(`sets expected values`, () => { @@ -139,7 +143,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { actual = vector.get(i); expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`iterates expected values`, () => { expect.hasAssertions(); @@ -149,7 +155,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[++i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`indexOf returns expected values`, () => { expect.hasAssertions(); @@ -169,7 +177,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expect(vector.indexOf('purple elephants')).toBe(-1); expect(vector.indexOf('whistling wombats')).toBe(-1); expect(vector.indexOf('carnivorous novices')).toBe(-1); - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index a259cbef87772..bfcf0d8547861 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -196,6 +196,28 @@ describe(`Utf8Vector`, () => { }); }); +describe(`LargeUtf8Vector`, () => { + const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; + const vector = vectorFromArray(values, new LargeUtf8); + + test(`has largeUtf8 type`, () => { + expect(vector.type).toBeInstanceOf(LargeUtf8); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, ['abc', '123']); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), ['foo', 'abc']); + }); +}); + describe(`ListVector`, () => { const values = [[1, 2], [1, 2, 3]]; const vector = vectorFromArray(values); diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts index 8a7ba1ed778aa..f78adc59f8e98 100644 --- a/js/test/unit/visitor-tests.ts +++ b/js/test/unit/visitor-tests.ts @@ -18,7 +18,7 @@ import { Field, Visitor, DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -36,6 +36,7 
@@ class BasicVisitor extends Visitor { public visitInt(type: T) { return (this.type = type); } public visitFloat(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDate(type: T) { return (this.type = type); } @@ -68,6 +69,7 @@ class FeatureVisitor extends Visitor { public visitFloat32(type: T) { return (this.type = type); } public visitFloat64(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDateDay(type: T) { return (this.type = type); } @@ -104,6 +106,7 @@ describe('Visitor', () => { test(`visits Int types`, () => validateBasicVisitor(new Int(true, 32))); test(`visits Float types`, () => validateBasicVisitor(new Float(0))); test(`visits Utf8 types`, () => validateBasicVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateBasicVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateBasicVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateBasicVisitor(new FixedSizeBinary(128))); test(`visits Date types`, () => validateBasicVisitor(new Date_(0))); @@ -144,6 +147,7 @@ describe('Visitor', () => { test(`visits Float32 types`, () => validateFeatureVisitor(new Float32())); test(`visits Float64 types`, () => validateFeatureVisitor(new Float64())); test(`visits Utf8 types`, () => validateFeatureVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateFeatureVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateFeatureVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateFeatureVisitor(new FixedSizeBinary(128))); test(`visits DateDay types`, () => validateFeatureVisitor(new DateDay())); From a91a11def5d6dc63463cd5ce0a7027f0174b5ac9 Mon Sep 17 00:00:00 2001 From: Carl Jackson Date: Sat, 16 Dec 2023 11:35:00 -0800 Subject: [PATCH 052/570] GH-37983: [JS] create nullable Fields in Table constructor (#37982) Previously, Tables constructed from vectors with null values would infer Schemas that did not permit null values. This resulted in downstream code making bad assumptions about the data. After this change, we always assume data can be nullable, and construct a Schema with nullable Fields. ### Are these changes tested? I informally tested these changes for my use case, but have not tested them more extensively ### Are there any user-facing changes? 
Yes: the `Table` constructor will now produce schemas with nullable `Fields` in situations in which it previously did not * Closes: #37983 --- js/src/table.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/table.ts b/js/src/table.ts index b3633aa9c3015..ef7d09a1d8f44 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -112,7 +112,7 @@ export class Table { } else if (typeof x === 'object') { const keys = Object.keys(x) as (keyof T)[]; const vecs = keys.map((k) => new Vector([x[k]])); - const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type))); + const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, true))); const [, batches] = distributeVectorsIntoRecordBatches(schema, vecs); return batches.length === 0 ? [new RecordBatch(x)] : batches; } From d9183643c86eccc7a620017e00333fb9d555fae0 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 16 Dec 2023 14:50:59 -0500 Subject: [PATCH 053/570] Revert "GH-37983: [JS] create nullable Fields in Table constructor (#37982)" (#39253) I missed that the tests were failing. --- js/src/table.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/table.ts b/js/src/table.ts index ef7d09a1d8f44..b3633aa9c3015 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -112,7 +112,7 @@ export class Table { } else if (typeof x === 'object') { const keys = Object.keys(x) as (keyof T)[]; const vecs = keys.map((k) => new Vector([x[k]])); - const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, true))); + const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type))); const [, batches] = distributeVectorsIntoRecordBatches(schema, vecs); return batches.length === 0 ? [new RecordBatch(x)] : batches; } From e43f575f4d21d66c1585d2b1be9a89963f5129b9 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 17 Dec 2023 15:08:38 -0500 Subject: [PATCH 054/570] GH-37983: [JS] Allow nullable fields in table when constructed from vector with nulls (#39254) --- js/src/table.ts | 2 +- js/test/unit/table-tests.ts | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/js/src/table.ts b/js/src/table.ts index b3633aa9c3015..58518257b30cb 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -112,7 +112,7 @@ export class Table { } else if (typeof x === 'object') { const keys = Object.keys(x) as (keyof T)[]; const vecs = keys.map((k) => new Vector([x[k]])); - const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type))); + const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullCount > 0))); const [, batches] = distributeVectorsIntoRecordBatches(schema, vecs); return batches.length === 0 ? 
[new RecordBatch(x)] : batches; } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 50c8565f0f144..6b34124abcaba 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -24,7 +24,7 @@ import { Schema, Field, Table, RecordBatch, Vector, builderThroughIterable, Float32, Int32, Dictionary, Utf8, Int8, - tableFromIPC, tableToIPC + tableFromIPC, tableToIPC, vectorFromArray } from 'apache-arrow'; const deepCopy = (t: Table) => tableFromIPC(tableToIPC(t)); @@ -104,7 +104,7 @@ describe(`Table`, () => { }); describe(`constructor`, () => { - test(`creates an empty Table with Columns`, () => { + test(`creates an empty Table with Vectors`, () => { let i32 = new Vector([makeData({ type: new Int32 })]); let f32 = new Vector([makeData({ type: new Float32 })]); const table = new Table({ i32, f32 }); @@ -117,8 +117,24 @@ describe(`Table`, () => { expect(f32.toArray()).toBeInstanceOf(Float32Array); }); - test(`creates a new Table from a Column`, () => { + test(`creates a Table with Vectors with Nulls`, () => { + const i32s = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, null]; + const i32 = vectorFromArray(i32s); + expect(i32).toHaveLength(i32s.length); + expect(i32.nullCount).toBe(1); + + const table = new Table({ i32 }); + const i32Field = table.schema.fields[0]; + + expect(i32Field.name).toBe('i32'); + expect(i32).toHaveLength(i32s.length); + expect(i32Field.nullable).toBe(true); + expect(i32.nullCount).toBe(1); + + expect(i32).toEqualVector(vectorFromArray(i32s)); + }); + test(`creates a new Table from a Typed Array`, () => { const i32s = new Int32Array(arange(new Array(10))); const i32 = makeVector([i32s]); expect(i32).toHaveLength(i32s.length); @@ -135,8 +151,7 @@ describe(`Table`, () => { expect(i32).toEqualVector(makeVector(i32s)); }); - test(`creates a new Table from Columns`, () => { - + test(`creates a new Table from Typed Arrays`, () => { const i32s = new Int32Array(arange(new Array(10))); const f32s = new Float32Array(arange(new Array(10))); @@ -164,8 +179,7 @@ describe(`Table`, () => { expect(f32).toEqualVector(makeVector(f32s)); }); - test(`creates a new Table from Columns with different lengths`, () => { - + test(`creates a new Table from Typed Arrays with different lengths`, () => { const i32s = new Int32Array(arange(new Array(20))); const f32s = new Float32Array(arange(new Array(8))); @@ -209,8 +223,7 @@ describe(`Table`, () => { expect(f32Vector).toEqualVector(new Vector([f32Expected])); }); - test(`creates a new Table from Columns with different lengths and number of inner chunks`, () => { - + test(`creates a new Table from Typed Arrays with different lengths and number of inner chunks`, () => { const i32s = new Int32Array(arange(new Array(20))); const f32s = new Float32Array(arange(new Array(16))); From 50ace0520748349dccc84f89e04bef0c289a4dba Mon Sep 17 00:00:00 2001 From: Stas Stepanov <78556261+stfdxv@users.noreply.github.com> Date: Mon, 18 Dec 2023 06:02:07 +0300 Subject: [PATCH 055/570] GH-38883: [Docs] Fix struct example to show hiding a child's entry (#38898) ### Rationale for this change See the issue. ### What changes are included in this PR? The struct example now demonstrates what it's supposed to. ### Are these changes tested? Renders well ### Are there any user-facing changes? Just docs. 
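As context for the docs change below: the rule being illustrated is that a struct child entry is only logically visible when both the struct's validity bit and the child's validity bit are set. A minimal sketch of that check follows; the helper names and the LSB-numbered bitmap assumption are mine, not part of this patch:

```ts
// Illustrative only: a struct child entry's effective validity is the AND of the
// struct's validity bit and the child's validity bit (LSB-numbered bitmaps assumed).
function bit(bitmap: Uint8Array, i: number): number {
    return (bitmap[i >> 3] >> (i % 8)) & 1;
}

function isChildEntryVisible(structValidity: Uint8Array, childValidity: Uint8Array, i: number): boolean {
    return (bit(structValidity, i) & bit(childValidity, i)) === 1;
}

// With the bitmaps from the corrected example (struct: 0b00001011, field-0: 0b00001101),
// entry 2 ('alice') is valid in the child array but hidden by the struct's null.
console.log(isChildEntryVisible(new Uint8Array([0b00001011]), new Uint8Array([0b00001101]), 2)); // false
```

This matches the corrected example in the diff, where `'alice'` exists in the child array yet never appears among the struct's logical values.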
* Closes: #38883 Authored-by: Stas Stepanov <78556261+stfdxv@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- docs/source/format/Columnar.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index a6632fa2cf81b..56cb27626a1f9 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -664,7 +664,9 @@ type. **Example Layout: ``Struct``** -The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: +The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]``, having +child arrays ``['joe', null, 'alice', 'mark']`` and ``[1, 2, null, 4]`` +would be: :: * Length: 4, Null count: 1 * Validity bitmap buffer: @@ -675,24 +677,24 @@ The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: * Children arrays: * field-0 array (`VarBinary`): - * Length: 4, Null count: 2 + * Length: 4, Null count: 1 * Validity bitmap buffer: | Byte 0 (validity bitmap) | Bytes 1-63 | |--------------------------|-----------------------| - | 00001001 | 0 (padding) | + | 00001101 | 0 (padding) | * Offsets buffer: | Bytes 0-19 | Bytes 20-63 | |----------------|-----------------------| - | 0, 3, 3, 3, 7 | unspecified (padding) | + | 0, 3, 3, 8, 12 | unspecified (padding) | * Value buffer: - | Bytes 0-6 | Bytes 7-63 | + | Bytes 0-11 | Bytes 12-63 | |----------------|-----------------------| - | joemark | unspecified (padding) | + | joealicemark | unspecified (padding) | * field-1 array (int32 array): * Length: 4, Null count: 1 @@ -722,10 +724,10 @@ Therefore, to know whether a particular child entry is valid, one must take the logical AND of the corresponding bits in the two validity bitmaps (the struct array's and the child array's). -This is illustrated in the example above, the child arrays have valid entries -for the null struct but they are "hidden" by the struct array's validity -bitmap. However, when treated independently, corresponding entries of the -children array will be non-null. +This is illustrated in the example above, one of the child arrays has a +valid entry ``'alice'`` for the null struct but it is "hidden" by the +struct array's validity bitmap. However, when treated independently, +corresponding entries of the children array will be non-null. Union Layout ------------ From 9c097d504a6acc193a5ce0a4cbf3551c948dcf90 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 17 Dec 2023 22:19:54 -0500 Subject: [PATCH 056/570] GH-39248: [JS] Unify code paths for utf8 and largeUtf8 (#39249) Reduce the code size by using common code paths. We only call `Number` a few times on numbers, which should be a noop. 
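To make the idea concrete, here is a minimal TypeScript sketch of a single accessor serving both 32-bit and 64-bit offsets; `toNumber` and `getValueBytes` are hypothetical names standing in for the patch's actual helpers, not copies of them:

```ts
// Sketch only, not the patch code: one code path for Int32Array and BigInt64Array offsets.
const toNumber = (x: number | bigint): number => typeof x === 'bigint' ? Number(x) : x;

function getValueBytes(values: Uint8Array, valueOffsets: Int32Array | BigInt64Array, index: number): Uint8Array | null {
    if (index + 1 >= valueOffsets.length) {
        return null;
    }
    const begin = toNumber(valueOffsets[index]);     // effectively a no-op for 32-bit offsets
    const end = toNumber(valueOffsets[index + 1]);   // Number(bigint) for 64-bit offsets
    return values.subarray(begin, end);
}
```

Coercing with `Number` is essentially free when the value is already a number, which is why sharing one code path should not penalize the existing 32-bit types.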
* Closes: #39248 --- js/.vscode/settings.json | 2 +- js/src/visitor/get.ts | 20 +++----------------- js/src/visitor/set.ts | 19 +++---------------- js/src/visitor/vectorassembler.ts | 24 +++--------------------- 4 files changed, 10 insertions(+), 55 deletions(-) diff --git a/js/.vscode/settings.json b/js/.vscode/settings.json index 113a662180c3c..e52da54e544ec 100644 --- a/js/.vscode/settings.json +++ b/js/.vscode/settings.json @@ -2,7 +2,7 @@ "typescript.tsdk": "node_modules/typescript/lib", "editor.trimAutoWhitespace": true, "editor.codeActionsOnSave": { - "source.fixAll.eslint": false + "source.fixAll.eslint": "explicit" }, "[javascript]": { "editor.tabSize": 4, diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index a801c90047c89..112d2f2983e53 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -116,16 +116,7 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ -const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number) => { - if (index + 1 >= valueOffsets.length) { - return null as any; - } - const x = valueOffsets[index]; - const y = valueOffsets[index + 1]; - return values.subarray(x, y); -}; -/** @ignore */ -const getLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: BigInt64Array, index: number) => { +const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array | BigInt64Array, index: number) => { if (index + 1 >= valueOffsets.length) { return null as any; } @@ -162,15 +153,10 @@ const getFixedSizeBinary = ({ stride, values }: Data< /** @ignore */ const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ -const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { +const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? decodeUtf8(bytes) : null as any; }; -/** @ignore */ -const getLargeUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { - const bytes = getLargeVariableWidthBytes(values, valueOffsets, index); - return bytes !== null ? 
decodeUtf8(bytes) : null as any; -}; /* istanbul ignore next */ /** @ignore */ @@ -344,7 +330,7 @@ GetVisitor.prototype.visitFloat16 = wrapGet(getFloat16); GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); -GetVisitor.prototype.visitLargeUtf8 = wrapGet(getLargeUtf8); +GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index a439ec8311fd6..15b0721660f55 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -125,16 +125,7 @@ export const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { - if (index + 1 < valueOffsets.length) { - const x = valueOffsets[index]; - const y = valueOffsets[index + 1]; - values.set(value.subarray(0, y - x), x); - } -}; - -/** @ignore */ -export const setLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { const x = bigIntToNumber(valueOffsets[index]); const y = bigIntToNumber(valueOffsets[index + 1]); @@ -176,13 +167,9 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ -const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { +const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); }; -/** @ignore */ -const setLargeUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { - setLargeVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); -}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -381,7 +368,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); -SetVisitor.prototype.visitLargeUtf8 = wrapSet(setLargeUtf8); +SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index 7a9d3bdd57b0d..df820e6f5e00c 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -42,6 +42,7 @@ export interface VectorAssembler extends Visitor { visitInt(data: Data): this; visitFloat(data: Data): this; visitUtf8(data: Data): this; + visitLargeUtf8(data: Data): this; visitBinary(data: Data): this; visitFixedSizeBinary(data: Data): this; visitDate(data: Data): this; @@ -202,29 +203,10 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { - const { length, values, valueOffsets } = data; - const { 
[0]: begin, [length]: end } = valueOffsets; - return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); -} - -/** @ignore */ -function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { +function assembleFlatListVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const begin = bigIntToNumber(valueOffsets[0]); const end = bigIntToNumber(valueOffsets[length]); - return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); -} - -/** @ignore */ -function _assembleFlatListVector( - this: VectorAssembler, - length: number, - begin: number, - end: number, - values: T['TArray'], - valueOffsets: T['TOffsetArray'] -) { const byteLength = Math.min(end - begin, values.byteLength - begin); // Push in the order FlatList types read their buffers addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets as any)); // valueOffsets buffer first @@ -255,7 +237,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; -VectorAssembler.prototype.visitLargeUtf8 = assembleLargeFlatListVector; +VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; From 4ec654497bb14e7ec0fbaead655c129ca61074ff Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 17 Dec 2023 22:27:05 -0500 Subject: [PATCH 057/570] GH-39257: [JS] LargeBinary (#39258) Merge after #39249 * Closes: #39257 --- docs/source/status.rst | 2 +- js/src/Arrow.dom.ts | 4 +- js/src/Arrow.ts | 3 +- js/src/builder.ts | 6 +-- js/src/builder/largebinary.ts | 54 ++++++++++++++++++++++++++ js/src/builder/largeutf8.ts | 22 ++--------- js/src/data.ts | 15 ++++++- js/src/enum.ts | 3 +- js/src/interfaces.ts | 4 ++ js/src/ipc/metadata/json.ts | 3 +- js/src/ipc/metadata/message.ts | 3 +- js/src/ipc/writer.ts | 5 +-- js/src/type.ts | 18 ++++++++- js/src/visitor.ts | 6 ++- js/src/visitor/builderctor.ts | 2 + js/src/visitor/bytelength.ts | 21 +++++----- js/src/visitor/get.ts | 8 ++-- js/src/visitor/indexof.ts | 4 +- js/src/visitor/iterator.ts | 4 +- js/src/visitor/jsontypeassembler.ts | 3 ++ js/src/visitor/jsonvectorassembler.ts | 10 +++-- js/src/visitor/set.ts | 10 ++--- js/src/visitor/typeassembler.ts | 5 +++ js/src/visitor/typecomparator.ts | 4 +- js/src/visitor/typector.ts | 1 + js/src/visitor/vectorassembler.ts | 6 ++- js/src/visitor/vectorloader.ts | 5 ++- js/test/data/tables.ts | 2 +- js/test/generate-test-data.ts | 18 ++++++++- js/test/unit/builders/builder-tests.ts | 1 + js/test/unit/generated-data-tests.ts | 1 + js/test/unit/visitor-tests.ts | 6 ++- 32 files changed, 191 insertions(+), 68 deletions(-) create mode 100644 js/src/builder/largebinary.ts diff --git a/docs/source/status.rst b/docs/source/status.rst index e52e4e4cd49bc..e860aceb76e15 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -62,7 +62,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Binary | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Large Binary | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +| Large Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | 
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts index 9ec76fdd009f3..cdb4171162f63 100644 --- a/js/src/Arrow.dom.ts +++ b/js/src/Arrow.dom.ts @@ -48,7 +48,7 @@ export { Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, Utf8, LargeUtf8, - Binary, + Binary, LargeBinary, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, @@ -78,7 +78,7 @@ export { } from './Arrow.js'; export { - BinaryBuilder, + BinaryBuilder, LargeBinaryBuilder, BoolBuilder, DateBuilder, DateDayBuilder, DateMillisecondBuilder, DecimalBuilder, diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index b7e5f63a6ab5a..6251a9e77717b 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -37,7 +37,7 @@ export { Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, Utf8, LargeUtf8, - Binary, + Binary, LargeBinary, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, @@ -80,6 +80,7 @@ export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, Dur export { Utf8Builder } from './builder/utf8.js'; export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; +export { LargeBinaryBuilder } from './builder/largebinary.js'; export { ListBuilder } from './builder/list.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; export { MapBuilder } from './builder/map.js'; diff --git a/js/src/builder.ts b/js/src/builder.ts index 1a4c52f871bbf..a4e2d4d89325c 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, LargeUtf8, Binary, List, Map_, + Utf8, LargeUtf8, Binary, LargeBinary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -285,7 +285,7 @@ export abstract class Builder { if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions valueOffsets = _offsets?.flush(length); - } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists + } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, LargeBinary, Utf8, LargeUtf8), and Lists data = _values?.flush(_offsets.last()); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval) data = _values?.flush(length); @@ -352,7 +352,7 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; diff --git a/js/src/builder/largebinary.ts b/js/src/builder/largebinary.ts new file mode 100644 index 0000000000000..59aa7144d20a1 --- /dev/null +++ b/js/src/builder/largebinary.ts @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { LargeBinary } from '../type.js'; +import { toUint8Array } from '../util/buffer.js'; +import { BufferBuilder } from './buffer.js'; +import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; + +/** @ignore */ +export class LargeBinaryBuilder extends VariableWidthBuilder { + constructor(opts: BuilderOptions) { + super(opts); + this._values = new BufferBuilder(new Uint8Array(0)); + } + public get byteLength(): number { + let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: Uint8Array) { + return super.setValue(index, toUint8Array(value)); + } + protected _flushPending(pending: Map, pendingLength: number) { + const offsets = this._offsets; + const data = this._values.reserve(pendingLength).buffer; + let offset = 0; + for (const [index, value] of pending) { + if (value === undefined) { + offsets.set(index, BigInt(0)); + } else { + const length = value.length; + data.set(value, offset); + offsets.set(index, BigInt(length)); + offset += length; + } + } + } +} diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index fddfeaf8e7b17..51890100095c1 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -19,6 +19,7 @@ import { LargeUtf8 } from '../type.js'; import { encodeUtf8 } from '../util/utf8.js'; import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; +import { LargeBinaryBuilder } from './largebinary.js'; /** @ignore */ export class LargeUtf8Builder extends VariableWidthBuilder { @@ -36,24 +37,9 @@ export class LargeUtf8Builder extends VariableWidthBuilder, pendingLength: number): void { } - protected _flushPending(pending: Map, pendingLength: number) { - const offsets = this._offsets; - const data = this._values.reserve(pendingLength).buffer; - let offset = 0; - for (const [index, value] of pending) { - if (value === undefined) { - offsets.set(index, BigInt(0)); - } else { - const length = value.length; - data.set(value, offset); - offsets.set(index, BigInt(length)); - offset += length; - } - } - } + protected _flushPending(pending: Map, pendingLength: number): void { } } -// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending; +(LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending; diff --git a/js/src/data.ts b/js/src/data.ts index 145ee9d049cb4..6f8792508858b 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -17,7 +17,7 @@ import { Vector } from './vector.js'; import { BufferType, Type, UnionMode } from './enum.js'; -import { DataType, LargeUtf8, strideForType } 
from './type.js'; +import { DataType, strideForType } from './type.js'; import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; // When slicing, we do not know the null count of the sliced range without @@ -253,7 +253,7 @@ export class Data { import { Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -324,6 +324,14 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitLargeBinary(props: LargeBinaryDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const data = toUint8Array(props['data']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); + } public visitFixedSizeBinary(props: FixedSizeBinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const nullBitmap = toUint8Array(props['nullBitmap']); @@ -444,6 +452,7 @@ interface IntervalDataProps extends DataProps_ { data?: D interface DurationDataProps extends DataProps_ { data?: DataBuffer } interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface LargeBinaryDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } @@ -468,6 +477,7 @@ export type DataProps = ( T extends Duration /* */ ? DurationDataProps : T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : + T extends LargeBinary /* */ ? LargeBinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends List /* */ ? ListDataProps : @@ -495,6 +505,7 @@ export function makeData(props: IntervalDataProps): Data< export function makeData(props: DurationDataProps): Data; export function makeData(props: FixedSizeBinaryDataProps): Data; export function makeData(props: BinaryDataProps): Data; +export function makeData(props: LargeBinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: ListDataProps): Data; diff --git a/js/src/enum.ts b/js/src/enum.ts index 764ea64e63338..0eecc0c68b525 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -173,7 +173,8 @@ export enum Type { FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */ FixedSizeList = 16, /** Fixed-size list. 
Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ - Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */ + Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ + LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ Dictionary = -1, /** Dictionary aka Category type */ diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 707d01bb14cca..c4119a8bd287a 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -35,6 +35,7 @@ import type { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder import type { Utf8Builder } from './builder/utf8.js'; import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; +import type { LargeBinaryBuilder } from './builder/largebinary.js'; import type { ListBuilder } from './builder/list.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; import type { MapBuilder } from './builder/map.js'; @@ -210,6 +211,7 @@ export type TypeToDataType = { [Type.Utf8]: type.Utf8; [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; + [Type.LargeBinary]: type.LargeBinary; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; [Type.DateDay]: type.DateDay; @@ -264,6 +266,7 @@ type TypeToBuilder = { [Type.Utf8]: Utf8Builder; [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; + [Type.LargeBinary]: LargeBinaryBuilder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; [Type.DateDay]: DateDayBuilder; @@ -318,6 +321,7 @@ type DataTypeToBuilder = { [Type.Utf8]: T extends type.Utf8 ? Utf8Builder : never; [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; + [Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; [Type.DateDay]: T extends type.DateDay ? 
DateDayBuilder : never; diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index b669c0c612f8a..8dc81ced3ffd1 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -20,7 +20,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -149,6 +149,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'NONE': return new Null(); case 'null': return new Null(); case 'binary': return new Binary(); + case 'largebinary': return new LargeBinary(); case 'utf8': return new Utf8(); case 'largeutf8': return new LargeUtf8(); case 'bool': return new Bool(); diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index cf05bff54cfba..552c4d846e863 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -56,7 +56,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -432,6 +432,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['NONE']: return new Null(); case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); + case Type['LargeBinary']: return new LargeBinary(); case Type['Utf8']: return new Utf8(); case Type['LargeUtf8']: return new LargeUtf8(); case Type['Bool']: return new Bool(); diff --git a/js/src/ipc/writer.ts b/js/src/ipc/writer.ts index 54b4b0249e420..565b0825bd9be 100644 --- a/js/src/ipc/writer.ts +++ b/js/src/ipc/writer.ts @@ -391,7 +391,7 @@ export class RecordBatchJSONWriter extends RecordBatchW protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) { this._dictionaryDeltaOffsets.set(id, dictionary.length + (this._dictionaryDeltaOffsets.get(id) || 0)); this._write(this._dictionaryBlocks.length === 0 ? ` ` : `,\n `); - this._write(`${dictionaryBatchToJSON(dictionary, id, isDelta)}`); + this._write(dictionaryBatchToJSON(dictionary, id, isDelta)); this._dictionaryBlocks.push(new FileBlock(0, 0, 0)); return this; } @@ -401,7 +401,6 @@ export class RecordBatchJSONWriter extends RecordBatchW return this; } public close() { - if (this._dictionaries.length > 0) { this._write(`,\n "dictionaries": [\n`); for (const batch of this._dictionaries) { @@ -413,7 +412,7 @@ export class RecordBatchJSONWriter extends RecordBatchW if (this._recordBatches.length > 0) { for (let i = -1, n = this._recordBatches.length; ++i < n;) { this._write(i === 0 ? 
`,\n "batches": [\n ` : `,\n `); - this._write(`${recordBatchToJSON(this._recordBatches[i])}`); + this._write(recordBatchToJSON(this._recordBatches[i])); this._recordBatchBlocks.push(new FileBlock(0, 0, 0)); } this._write(`\n ]`); diff --git a/js/src/type.ts b/js/src/type.ts index 6223d0316f17a..dea5301aed355 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -58,6 +58,7 @@ export abstract class DataType { })(Binary.prototype); } +/** @ignore */ +export interface LargeBinary extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } +/** @ignore */ +export class LargeBinary extends DataType { + constructor() { + super(); + } + public get typeId() { return Type.LargeBinary as Type.LargeBinary; } + public toString() { return `LargeBinary`; } + protected static [Symbol.toStringTag] = ((proto: LargeBinary) => { + (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'LargeBinary'; + })(LargeBinary.prototype); +} + /** @ignore */ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ @@ -601,7 +618,6 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; - (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } diff --git a/js/src/visitor.ts b/js/src/visitor.ts index 5b3cc4d3d0593..2fb5e7e14bc22 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -38,6 +38,7 @@ export abstract class Visitor { public visitUtf8(_node: any, ..._args: any[]): any { return null; } public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } + public visitLargeBinary(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } public visitTimestamp(_node: any, ..._args: any[]): any { return null; } @@ -48,7 +49,7 @@ export abstract class Visitor { public visitUnion(_node: any, ..._args: any[]): any { return null; } public visitDictionary(_node: any, ..._args: any[]): any { return null; } public visitInterval(_node: any, ..._args: any[]): any { return null; } - public visitDuration(_node: any, ... 
_args: any[]): any { return null; } + public visitDuration(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeList(_node: any, ..._args: any[]): any { return null; } public visitMap(_node: any, ..._args: any[]): any { return null; } } @@ -92,6 +93,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Utf8: fn = visitor.visitUtf8; break; case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; + case Type.LargeBinary: fn = visitor.visitLargeBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; case Type.DateDay: fn = visitor.visitDateDay || visitor.visitDate; break; @@ -153,6 +155,7 @@ function inferDType(type: T): Type { // @ts-ignore return Type.Float; case Type.Binary: return Type.Binary; + case Type.LargeBinary: return Type.LargeBinary; case Type.Utf8: return Type.Utf8; case Type.LargeUtf8: return Type.LargeUtf8; case Type.Bool: return Type.Bool; @@ -234,6 +237,7 @@ export interface Visitor { visitUtf8(node: any, ...args: any[]): any; visitLargeUtf8(node: any, ...args: any[]): any; visitBinary(node: any, ...args: any[]): any; + visitLargeBinary(node: any, ...args: any[]): any; visitFixedSizeBinary(node: any, ...args: any[]): any; visitDate(node: any, ...args: any[]): any; visitDateDay?(node: any, ...args: any[]): any; diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 83374712b2642..5b3758c4e0cbc 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -22,6 +22,7 @@ import { DataType } from '../type.js'; import { Visitor } from '../visitor.js'; import { BuilderCtor } from '../interfaces.js'; import { BinaryBuilder } from '../builder/binary.js'; +import { LargeBinaryBuilder } from '../builder/largebinary.js'; import { BoolBuilder } from '../builder/bool.js'; import { DateBuilder, DateDayBuilder, DateMillisecondBuilder } from '../builder/date.js'; import { DecimalBuilder } from '../builder/decimal.js'; @@ -70,6 +71,7 @@ export class GetBuilderCtor extends Visitor { public visitUtf8() { return Utf8Builder; } public visitLargeUtf8() { return LargeUtf8Builder; } public visitBinary() { return BinaryBuilder; } + public visitLargeBinary() { return LargeBinaryBuilder; } public visitFixedSizeBinary() { return FixedSizeBinaryBuilder; } public visitDate() { return DateBuilder; } public visitDateDay() { return DateDayBuilder; } diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts index c3bfadd50e155..43399b2571fe2 100644 --- a/js/src/visitor/bytelength.ts +++ b/js/src/visitor/bytelength.ts @@ -26,9 +26,10 @@ import { Type, TimeUnit, UnionMode } from '../enum.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, } from '../type.js'; +import { bigIntToNumber } from '../util/bigint.js'; /** @ignore */ const sum = (x: number, y: number) => x + y; @@ -39,6 +40,7 @@ export interface GetByteLengthVisitor extends Visitor { getVisitFn(node: Data | T): (data: Data, index: number) => number; getVisitFn(node: T): (data: Data>, index: number) => number; visitBinary(data: Data, index: number): number; + visitLargeBinary(data: Data, index: number): number; visitUtf8(data: Data, index: number): number; 
visitLargeUtf8(data: Data, index: number): number; visitList(data: Data, index: number): number; @@ -95,22 +97,15 @@ export class GetByteLengthVisitor extends Visitor { } /** @ignore */ -const getUtf8ByteLength = ({ valueOffsets }: Data, index: number): number => { +const getBinaryByteLength = ({ valueOffsets }: Data, index: number): number => { // 4 + 4 for the indices, `end - start` for the data bytes - return 8 + (valueOffsets[index + 1] - valueOffsets[index]); -}; - -/** @ignore */ -const getBinaryByteLength = ({ valueOffsets }: Data, index: number): number => { - // 4 + 4 for the indices, `end - start` for the data bytes - return 8 + (valueOffsets[index + 1] - valueOffsets[index]); + return 8 + bigIntToNumber(valueOffsets[index + 1]) - bigIntToNumber(valueOffsets[index]); }; /** @ignore */ const getListByteLength = ({ valueOffsets, stride, children }: Data, index: number): number => { const child: Data = children[0]; - const { [index * stride]: start } = valueOffsets; - const { [index * stride + 1]: end } = valueOffsets; + const { [index * stride]: start, [index * stride + 1]: end } = valueOffsets; const visit = instance.getVisitFn(child.type); const slice = child.slice(start, end - start); let size = 8; // 4 + 4 for the indices @@ -155,8 +150,10 @@ const getSparseUnionByteLength = ({ children }: Data, return 4 + instance.visitMany(children, children.map(() => index)).reduce(sum, 0); }; -GetByteLengthVisitor.prototype.visitUtf8 = getUtf8ByteLength; +GetByteLengthVisitor.prototype.visitUtf8 = getBinaryByteLength; +GetByteLengthVisitor.prototype.visitLargeUtf8 = getBinaryByteLength; GetByteLengthVisitor.prototype.visitBinary = getBinaryByteLength; +GetByteLengthVisitor.prototype.visitLargeBinary = getBinaryByteLength; GetByteLengthVisitor.prototype.visitList = getListByteLength; GetByteLengthVisitor.prototype.visitFixedSizeList = getFixedSizeListByteLength; GetByteLengthVisitor.prototype.visitUnion = getUnionByteLength; diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 112d2f2983e53..3ab3bcb68c386 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -28,7 +28,7 @@ import { uint16ToFloat64 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -36,7 +36,7 @@ import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, - Union, DenseUnion, SparseUnion, LargeUtf8, + Union, DenseUnion, SparseUnion, } from '../type.js'; /** @ignore */ @@ -63,6 +63,7 @@ export interface GetVisitor extends Visitor { visitUtf8(data: Data, index: number): T['TValue'] | null; visitLargeUtf8(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; + visitLargeBinary(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; visitDateDay(data: Data, index: number): T['TValue'] | null; @@ -151,7 +152,7 @@ const 
getBigInts = ({ values }: Data, index: number): T[ const getFixedSizeBinary = ({ stride, values }: Data, index: number): T['TValue'] => values.subarray(stride * index, stride * (index + 1)); /** @ignore */ -const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); +const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); @@ -332,6 +333,7 @@ GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); +GetVisitor.prototype.visitLargeBinary = wrapGet(getBinary); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); GetVisitor.prototype.visitDateDay = wrapGet(getDateDay); diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 76f95788c7953..1e1cb87a9840e 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -59,6 +59,7 @@ export interface IndexOfVisitor extends Visitor { visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; visitDateDay(data: Data, value: T['TValue'] | null, index?: number): number; @@ -175,6 +176,7 @@ IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; +IndexOfVisitor.prototype.visitLargeBinary = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; IndexOfVisitor.prototype.visitDateDay = indexOfValue; diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index 09dfcb0b565ae..bf7e9d1591b40 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,6 
+57,7 @@ export interface IteratorVisitor extends Visitor { visitUtf8(vector: Vector): IterableIterator; visitLargeUtf8(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; + visitLargeBinary(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; visitDateDay(vector: Vector): IterableIterator; @@ -161,6 +162,7 @@ IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; +IteratorVisitor.prototype.visitLargeBinary = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; IteratorVisitor.prototype.visitDateDay = vectorIterator; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index a6746a858ecb4..823b1dea104c8 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -42,6 +42,9 @@ export class JSONTypeAssembler extends Visitor { public visitBinary({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeBinary({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitBool({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 9a3cb8601a434..88699d8f168c2 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -27,7 +27,7 @@ import { BitIterator, getBit, getBool } from '../util/bit.js'; import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, LargeUtf8, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, } from '../type.js'; /** @ignore */ @@ -44,6 +44,7 @@ export interface JSONVectorAssembler extends Visitor { visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; + visitLargeBinary(data: Data): { DATA: string[]; OFFSET: string[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; visitTimestamp(data: Data): { DATA: string[] }; @@ -105,7 +106,10 @@ export class JSONVectorAssembler extends Visitor { return { 'DATA': [...new Vector([data])], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } public visitBinary(data: Data) { - return { 'DATA': [...binaryToString(new Vector([data]))], OFFSET: [...data.valueOffsets] }; + return { 'DATA': [...binaryToString(new Vector([data]))], 'OFFSET': [...data.valueOffsets] }; + } + public visitLargeBinary(data: Data) { + return { 'DATA': [...binaryToString(new Vector([data]))], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } public visitFixedSizeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))] }; @@ -168,7 +172,7 @@ export class JSONVectorAssembler extends Visitor { } /** @ignore */ -function* binaryToString(vector: Vector | Vector) { +function* binaryToString(vector: Vector | Vector | Vector) { for (const octets of vector as Iterable) { yield octets.reduce((str, byte) => { return `${str}${('0' + (byte & 
0xFF).toString(16)).slice(-2)}`; diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index 15b0721660f55..eb1f280964c8e 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -26,7 +26,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -61,6 +61,7 @@ export interface SetVisitor extends Visitor { visitUtf8(data: Data, index: number, value: T['TValue']): void; visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; + visitLargeBinary(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; visitDateDay(data: Data, index: number, value: T['TValue']): void; @@ -165,11 +166,9 @@ export const setDateMillisecond = ({ values }: Data({ stride, values }: Data, index: number, value: T['TValue']): void => { values.set(value.subarray(0, stride), stride * index); }; /** @ignore */ -const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); +const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ -const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { - setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); -}; +const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -370,6 +369,7 @@ SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); +SetVisitor.prototype.visitLargeBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); SetVisitor.prototype.visitDateDay = wrapSet(setDateDay); diff --git a/js/src/visitor/typeassembler.ts b/js/src/visitor/typeassembler.ts index f072714222739..169f3627a4002 100644 --- a/js/src/visitor/typeassembler.ts +++ b/js/src/visitor/typeassembler.ts @@ -25,6 +25,7 @@ import { Null } from '../fb/null.js'; import { Int } from '../fb/int.js'; import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; +import { LargeBinary } from '../fb/large-binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; import { LargeUtf8 } from '../fb/large-utf8.js'; @@ -71,6 +72,10 @@ export class TypeAssembler extends Visitor { Binary.startBinary(b); return Binary.endBinary(b); } + public visitLargeBinary(_node: T, b: Builder) { + LargeBinary.startLargeBinary(b); + return LargeBinary.endLargeBinary(b); + } public visitBool(_node: T, b: 
Builder) { Bool.startBool(b); return Bool.endBool(b); diff --git a/js/src/visitor/typecomparator.ts b/js/src/visitor/typecomparator.ts index 2417dec09c6e9..a113f2ea31e8d 100644 --- a/js/src/visitor/typecomparator.ts +++ b/js/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,6 +55,7 @@ export interface TypeComparator extends Visitor { visitUtf8(type: T, other?: DataType | null): other is T; visitLargeUtf8(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; + visitLargeBinary(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; visitDateDay(type: T, other?: DataType | null): other is T; @@ -252,6 +253,7 @@ TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; TypeComparator.prototype.visitLargeUtf8 = compareAny; TypeComparator.prototype.visitBinary = compareAny; +TypeComparator.prototype.visitLargeBinary = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; TypeComparator.prototype.visitDateDay = compareDate; diff --git a/js/src/visitor/typector.ts b/js/src/visitor/typector.ts index 2e0bbc4147abb..a781b5fb14fcc 100644 --- a/js/src/visitor/typector.ts +++ b/js/src/visitor/typector.ts @@ -51,6 +51,7 @@ export class GetDataTypeConstructor extends Visitor { public visitUtf8() { return type.Utf8; } public visitLargeUtf8() { return type.LargeUtf8; } public visitBinary() { return type.Binary; } + public visitLargeBinary() { return type.LargeBinary; } public visitFixedSizeBinary() { return type.FixedSizeBinary; } public visitDate() { return type.Date_; } public visitDateDay() { return type.DateDay; } diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index df820e6f5e00c..7dc3695582dd7 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -27,7 +27,7 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, LargeUtf8, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -44,6 +44,7 @@ export interface VectorAssembler extends Visitor { visitUtf8(data: Data): this; visitLargeUtf8(data: Data): this; visitBinary(data: Data): this; + visitLargeBinary(data: Data): this; visitFixedSizeBinary(data: Data): this; visitDate(data: Data): this; visitTimestamp(data: Data): this; @@ -203,7 +204,7 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { +function assembleFlatListVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const begin = bigIntToNumber(valueOffsets[0]); const end = 
bigIntToNumber(valueOffsets[length]); @@ -239,6 +240,7 @@ VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; +VectorAssembler.prototype.visitLargeBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; VectorAssembler.prototype.visitTimestamp = assembleFlatVector; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index 35f28f49baada..c9c016d6b463c 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -77,6 +77,9 @@ export class VectorLoader extends Visitor { public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitLargeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); + } public visitFixedSizeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); } @@ -169,7 +172,7 @@ export class JSONVectorLoader extends VectorLoader { return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[])); } else if (DataType.isDecimal(type)) { return toArrayBufferView(Uint8Array, Int128.convertArray(sources[offset] as string[])); - } else if (DataType.isBinary(type) || DataType.isFixedSizeBinary(type)) { + } else if (DataType.isBinary(type) || DataType.isLargeBinary(type) || DataType.isFixedSizeBinary(type)) { return binaryDataFromJSON(sources[offset] as string[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); diff --git a/js/test/data/tables.ts b/js/test/data/tables.ts index 449cfe1fb853a..89cf93eab585b 100644 --- a/js/test/data/tables.ts +++ b/js/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'largeBinary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 9d7b038331fe6..be248ad2c6ed8 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -25,7 +25,7 @@ import { Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, Utf8, LargeUtf8, - Binary, + Binary, LargeBinary, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, 
TimestampMicrosecond, TimestampNanosecond, @@ -54,6 +54,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; @@ -78,6 +79,7 @@ interface TestDataVectorGenerator extends Visitor { visitUtf8: typeof generateUtf8; visitLargeUtf8: typeof generateLargeUtf8; visitBinary: typeof generateBinary; + visitLargeBinary: typeof generateLargeBinary; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; visitTimestamp: typeof generateTimestamp; @@ -104,6 +106,7 @@ TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; TestDataVectorGenerator.prototype.visitBinary = generateBinary; +TestDataVectorGenerator.prototype.visitLargeBinary = generateLargeBinary; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; TestDataVectorGenerator.prototype.visitTimestamp = generateTimestamp; @@ -219,6 +222,7 @@ export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); +export const largeBinary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeBinary(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); export const dateMillisecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateMillisecond(), length, nullCount); @@ -246,7 +250,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, 
binary, largeBinary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -368,6 +372,16 @@ function generateBinary(this: TestDataVectorGenerator, type: T return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateLargeBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); + const values = [...valueOffsets.slice(1)] + .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) + .map((length) => length == null ? null : randomBytes(Number(length))); + const data = createVariableWidthBytes(length, nullBitmap, valueOffsets, (i) => values[i]!); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; +} + function generateFixedSizeBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const data = fillRandom(Uint8Array, length * type.byteWidth); diff --git a/js/test/unit/builders/builder-tests.ts b/js/test/unit/builders/builder-tests.ts index 0137c7aa66635..4d1be9b225b08 100644 --- a/js/test/unit/builders/builder-tests.ts +++ b/js/test/unit/builders/builder-tests.ts @@ -46,6 +46,7 @@ describe('Generated Test Data', () => { describe('Utf8Builder', () => { validateBuilder(generate.utf8); }); describe('LargeUtf8Builder', () => { validateBuilder(generate.largeUtf8); }); describe('BinaryBuilder', () => { validateBuilder(generate.binary); }); + describe('LargeBinaryBuilder', () => { validateBuilder(generate.largeBinary); }); describe('FixedSizeBinaryBuilder', () => { validateBuilder(generate.fixedSizeBinary); }); describe('DateDayBuilder', () => { validateBuilder(generate.dateDay); }); describe('DateMillisecondBuilder', () => { validateBuilder(generate.dateMillisecond); }); diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts index 0a06bcbab8ee0..1e26e74730a2d 100644 --- a/js/test/unit/generated-data-tests.ts +++ b/js/test/unit/generated-data-tests.ts @@ -40,6 +40,7 @@ describe('Generated Test Data', () => { describe('Utf8', () => { validateVector(generate.utf8()); }); describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); describe('Binary', () => { validateVector(generate.binary()); }); + describe('LargeBinary', () => { validateVector(generate.largeBinary()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); describe('DateMillisecond', () => { validateVector(generate.dateMillisecond()); }); diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts index f78adc59f8e98..6ecb6cca33ed5 100644 --- a/js/test/unit/visitor-tests.ts +++ b/js/test/unit/visitor-tests.ts @@ -18,7 +18,7 @@ 
import { Field, Visitor, DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -38,6 +38,7 @@ class BasicVisitor extends Visitor { public visitUtf8(type: T) { return (this.type = type); } public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } + public visitLargeBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDate(type: T) { return (this.type = type); } public visitTimestamp(type: T) { return (this.type = type); } @@ -71,6 +72,7 @@ class FeatureVisitor extends Visitor { public visitUtf8(type: T) { return (this.type = type); } public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } + public visitLargeBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDateDay(type: T) { return (this.type = type); } public visitDateMillisecond(type: T) { return (this.type = type); } @@ -108,6 +110,7 @@ describe('Visitor', () => { test(`visits Utf8 types`, () => validateBasicVisitor(new Utf8())); test(`visits LargeUtf8 types`, () => validateBasicVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateBasicVisitor(new Binary())); + test(`visits LargeBinary types`, () => validateBasicVisitor(new LargeBinary())); test(`visits FixedSizeBinary types`, () => validateBasicVisitor(new FixedSizeBinary(128))); test(`visits Date types`, () => validateBasicVisitor(new Date_(0))); test(`visits Timestamp types`, () => validateBasicVisitor(new Timestamp(0, 'UTC'))); @@ -149,6 +152,7 @@ describe('Visitor', () => { test(`visits Utf8 types`, () => validateFeatureVisitor(new Utf8())); test(`visits LargeUtf8 types`, () => validateFeatureVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateFeatureVisitor(new Binary())); + test(`visits LargeBinary types`, () => validateFeatureVisitor(new LargeBinary())); test(`visits FixedSizeBinary types`, () => validateFeatureVisitor(new FixedSizeBinary(128))); test(`visits DateDay types`, () => validateFeatureVisitor(new DateDay())); test(`visits DateMillisecond types`, () => validateFeatureVisitor(new DateMillisecond())); From 81e47b20b241df100f3a24194e97a0423adc0d5e Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Mon, 18 Dec 2023 16:17:29 +0100 Subject: [PATCH 058/570] GH-39243:[R][CI] Remove r-arrow conda nightlies (#39244) ### Rationale for this change The jobs run for ~30h/day for basically no benefit. See issue for details. ### What changes are included in this PR? Removal of all r-arrow conda nightlies and auxiliary files. ### Are these changes tested? No. ### Are there any user-facing changes? No. 
* Closes: #39243 Authored-by: Jacob Wujciak-Jens Signed-off-by: Jacob Wujciak-Jens --- .../.ci_support/r/linux_64_r_base4.2.yaml | 27 ----- .../.ci_support/r/linux_64_r_base4.3.yaml | 27 ----- .../r/linux_aarch64_r_base4.2.yaml | 31 ------ .../r/linux_aarch64_r_base4.3.yaml | 31 ------ .../.ci_support/r/osx_64_r_base4.2.yaml | 27 ----- .../.ci_support/r/osx_64_r_base4.3.yaml | 27 ----- .../.ci_support/r/osx_arm64_r_base4.2.yaml | 27 ----- .../.ci_support/r/osx_arm64_r_base4.3.yaml | 27 ----- .../conda-recipes/.ci_support/r/win_64_.yaml | 12 -- dev/tasks/conda-recipes/r-arrow/bld.bat | 14 --- dev/tasks/conda-recipes/r-arrow/build.sh | 14 --- dev/tasks/conda-recipes/r-arrow/build_win.sh | 7 -- dev/tasks/conda-recipes/r-arrow/configure.win | 8 -- .../conda-recipes/r-arrow/install.libs.R | 5 - dev/tasks/conda-recipes/r-arrow/meta.yaml | 73 ------------ dev/tasks/tasks.yml | 104 +----------------- 16 files changed, 3 insertions(+), 458 deletions(-) delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/win_64_.yaml delete mode 100644 dev/tasks/conda-recipes/r-arrow/bld.bat delete mode 100755 dev/tasks/conda-recipes/r-arrow/build.sh delete mode 100755 dev/tasks/conda-recipes/r-arrow/build_win.sh delete mode 100755 dev/tasks/conda-recipes/r-arrow/configure.win delete mode 100644 dev/tasks/conda-recipes/r-arrow/install.libs.R delete mode 100644 dev/tasks/conda-recipes/r-arrow/meta.yaml diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml deleted file mode 100644 index 6e661e1357d22..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.2.yaml +++ /dev/null @@ -1,27 +0,0 @@ -c_compiler: -- gcc -c_compiler_version: -- '12' -cdt_name: -- cos6 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- gxx -cxx_compiler_version: -- '12' -docker_image: -- quay.io/condaforge/linux-anvil-cos7-x86_64 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.2' -target_platform: -- linux-64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml deleted file mode 100644 index a4d06c9f20cdd..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.3.yaml +++ /dev/null @@ -1,27 +0,0 @@ -c_compiler: -- gcc -c_compiler_version: -- '12' -cdt_name: -- cos6 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- gxx -cxx_compiler_version: -- '12' -docker_image: -- quay.io/condaforge/linux-anvil-cos7-x86_64 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.3' -target_platform: -- linux-64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git 
a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml deleted file mode 100644 index 9dcd0c34c851c..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.2.yaml +++ /dev/null @@ -1,31 +0,0 @@ -BUILD: -- aarch64-conda_cos7-linux-gnu -c_compiler: -- gcc -c_compiler_version: -- '12' -cdt_arch: -- aarch64 -cdt_name: -- cos7 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- gxx -cxx_compiler_version: -- '12' -docker_image: -- quay.io/condaforge/linux-anvil-cos7-x86_64 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.2' -target_platform: -- linux-aarch64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml deleted file mode 100644 index 028b190bb1ef5..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_aarch64_r_base4.3.yaml +++ /dev/null @@ -1,31 +0,0 @@ -BUILD: -- aarch64-conda_cos7-linux-gnu -c_compiler: -- gcc -c_compiler_version: -- '12' -cdt_arch: -- aarch64 -cdt_name: -- cos7 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- gxx -cxx_compiler_version: -- '12' -docker_image: -- quay.io/condaforge/linux-anvil-cos7-x86_64 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.3' -target_platform: -- linux-aarch64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml deleted file mode 100644 index 2116eaf7b8b21..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.2.yaml +++ /dev/null @@ -1,27 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -c_compiler: -- clang -c_compiler_version: -- '15' -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- clangxx -cxx_compiler_version: -- '15' -macos_machine: -- x86_64-apple-darwin13.4.0 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.2' -target_platform: -- osx-64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml deleted file mode 100644 index 7b8b62d8e00bb..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.3.yaml +++ /dev/null @@ -1,27 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -c_compiler: -- clang -c_compiler_version: -- '15' -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- clangxx -cxx_compiler_version: -- '15' -macos_machine: -- x86_64-apple-darwin13.4.0 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.3' -target_platform: -- osx-64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml deleted file mode 100644 index af8a07c42208e..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.2.yaml +++ /dev/null @@ -1,27 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '11.0' -c_compiler: -- clang -c_compiler_version: -- '15' -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- clangxx -cxx_compiler_version: -- '15' 
-macos_machine: -- arm64-apple-darwin20.0.0 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.2' -target_platform: -- osx-arm64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml deleted file mode 100644 index a8e8aab83d598..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_arm64_r_base4.3.yaml +++ /dev/null @@ -1,27 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '11.0' -c_compiler: -- clang -c_compiler_version: -- '15' -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cxx_compiler: -- clangxx -cxx_compiler_version: -- '15' -macos_machine: -- arm64-apple-darwin20.0.0 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.3' -target_platform: -- osx-arm64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version diff --git a/dev/tasks/conda-recipes/.ci_support/r/win_64_.yaml b/dev/tasks/conda-recipes/.ci_support/r/win_64_.yaml deleted file mode 100644 index 72a5bf336c156..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/win_64_.yaml +++ /dev/null @@ -1,12 +0,0 @@ -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.1' -target_platform: -- win-64 diff --git a/dev/tasks/conda-recipes/r-arrow/bld.bat b/dev/tasks/conda-recipes/r-arrow/bld.bat deleted file mode 100644 index 04d59f282d84b..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/bld.bat +++ /dev/null @@ -1,14 +0,0 @@ -@echo on - -bash %RECIPE_DIR%/build_win.sh -IF %ERRORLEVEL% NEQ 0 exit 1 - -cp %RECIPE_DIR%/configure.win r -IF %ERRORLEVEL% NEQ 0 exit 1 - -cp %RECIPE_DIR%/install.libs.R r/src -IF %ERRORLEVEL% NEQ 0 exit 1 - -set "MAKEFLAGS=-j%CPU_COUNT%" -"%R%" CMD INSTALL --build r -IF %ERRORLEVEL% NEQ 0 exit 1 diff --git a/dev/tasks/conda-recipes/r-arrow/build.sh b/dev/tasks/conda-recipes/r-arrow/build.sh deleted file mode 100755 index 9f5255cbaeee3..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -ex - -# arrow uses C++17 -export ARROW_R_CXXFLAGS="${ARROW_R_CXXFLAGS} -std=c++17" -export LIBARROW_BUILD=false - -if [[ "${target_platform}" == osx-* ]]; then - # See https://conda-forge.org/docs/maintainer/knowledge_base.html#newer-c-features-with-old-sdk - export ARROW_R_CXXFLAGS="${ARROW_R_CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" -fi - -# ${R_ARGS} necessary to support cross-compilation -${R} CMD INSTALL --build r/. 
${R_ARGS} diff --git a/dev/tasks/conda-recipes/r-arrow/build_win.sh b/dev/tasks/conda-recipes/r-arrow/build_win.sh deleted file mode 100755 index 0c9a85ce3943a..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/build_win.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -set -exuo pipefail - -# Rename arrow.dll to lib_arrow.dll to avoid conflicts with the arrow-cpp arrow.dll -sed -i -e 's/void R_init_arrow/__declspec(dllexport) void R_init_lib_arrow/g' r/src/arrowExports.cpp -sed -i -e 's/useDynLib(arrow/useDynLib(lib_arrow/g' r/NAMESPACE diff --git a/dev/tasks/conda-recipes/r-arrow/configure.win b/dev/tasks/conda-recipes/r-arrow/configure.win deleted file mode 100755 index 0fc96576bde74..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/configure.win +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -set -euxo pipefail - -echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ACERO -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -DARROW_R_WITH_JSON" > src/Makevars.win -echo "PKG_CXXFLAGS=\$(CXX_VISIBILITY)" >> src/Makevars.win -echo 'CXX_STD=CXX17' >> src/Makevars.win -echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -larrow_acero -lparquet -larrow" >> src/Makevars.win diff --git a/dev/tasks/conda-recipes/r-arrow/install.libs.R b/dev/tasks/conda-recipes/r-arrow/install.libs.R deleted file mode 100644 index 005bbe16b9984..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/install.libs.R +++ /dev/null @@ -1,5 +0,0 @@ -src_dir <- file.path(R_PACKAGE_SOURCE, "src", fsep = "/") -dest_dir <- file.path(R_PACKAGE_DIR, paste0("libs", R_ARCH), fsep="/") - -dir.create(file.path(R_PACKAGE_DIR, paste0("libs", R_ARCH), fsep="/"), recursive = TRUE, showWarnings = FALSE) -file.copy(file.path(src_dir, "arrow.dll", fsep = "/"), file.path(dest_dir, "lib_arrow.dll", fsep = "/")) diff --git a/dev/tasks/conda-recipes/r-arrow/meta.yaml b/dev/tasks/conda-recipes/r-arrow/meta.yaml deleted file mode 100644 index e8b834254f41c..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/meta.yaml +++ /dev/null @@ -1,73 +0,0 @@ -{% set version = ARROW_VERSION %} -{% set posix = 'm2-' if win else '' %} -{% set native = 'm2w64-' if win else '' %} - -package: - name: r-arrow - version: {{ version|replace("-", "_") }} - -source: - path: ../../../../ - -build: - merge_build_host: true # [win] - number: 0 - rpaths: - - lib/R/lib/ - - lib/ - -requirements: - build: - - cross-r-base {{ r_base }} # [build_platform != target_platform] - - r-r6 # [build_platform != target_platform] - - r-assertthat # [build_platform != target_platform] - - r-bit64 # [build_platform != target_platform] - - r-purrr # [build_platform != target_platform] - - r-rlang # [build_platform != target_platform] - - r-tidyselect # [build_platform != target_platform] - - {{ compiler('c') }} # [not win] - - {{ compiler('cxx') }} # [not win] - - {{ compiler('r_clang') }} # [win] - - pkg-config - - {{ posix }}make - - {{ posix }}sed # [win] - - {{ posix }}coreutils # [win] - - {{ posix }}filesystem # [win] - - {{ posix }}zip # [win] - host: - # Needs to be here, otherwise merge_build_host runs into issues - - pkg-config # [win] - - libarrow {{ version }} - - r-base - - r-r6 - - r-cpp11 - - r-assertthat - - r-bit64 - - r-purrr - - r-rlang - - r-tidyselect - run: - - r-base - - r-r6 - - r-assertthat - - r-bit64 - - r-purrr - - r-rlang - - r-tidyselect - -test: - commands: - - $R -e "library('arrow'); stopifnot(arrow_with_acero(), arrow_with_dataset(), arrow_with_parquet(), arrow_with_s3())" # 
[not win] - - "\"%R%\" -e \"library('arrow'); stopifnot(arrow_with_acero(), arrow_with_dataset(), arrow_with_parquet(), arrow_with_s3())\"" # [win] - -about: - home: https://github.com/apache/arrow - license: Apache-2.0 - license_file: LICENSE.txt - summary: R Integration to 'Apache' 'Arrow'. - license_family: APACHE - -extra: - recipe-maintainers: - - conda-forge/r - - conda-forge/arrow-cpp diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 84c3cad6ac899..ed6ea08894f10 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -85,12 +85,6 @@ groups: r: - test*-r-* - # r-conda tasks - - conda-linux-x64-cpu-r* - - conda-linux-aarch64-cpu-r* - - conda-osx-x64-cpu-r* - - conda-osx-arm64-cpu-r* - - conda-win-x64-cpu-r* - r-binary-packages ruby: @@ -237,32 +231,12 @@ tasks: # # * On conda-forge the `pyarrow` and `arrow-cpp` packages are built in # the same feedstock as the dependency matrix is the same for them as - # Python and the OS are the main dimension. The R package `r-arrow` is - # an independent feedstock as it doesn't have the Python but the - # R dimension. + # Python and the OS are the main dimension. # * The files in `dev/tasks/conda-recipes/.ci_support/` are automatically # generated and to be synced regularly from the feedstock. We have no way # yet to generate them inside the arrow repository automatically. - - conda-linux-x64-cpu-r43: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_versionNone - r_config: linux_64_r_base4.3 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda - - conda-linux-x64-cpu-r42: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_versionNone - r_config: linux_64_r_base4.2 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r42(h[a-z0-9]+)_0.conda + # * We no longer run the arrow-r jobs as the feedstock is very stable and + # the complexity is mostly covered by arrow-cpp. 
conda-linux-x64-cpu-py3: ci: azure @@ -290,26 +264,6 @@ tasks: ########################### Conda Linux (aarch64) ########################### - conda-linux-aarch64-cpu-r43: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_aarch64_cuda_compiler_versionNone - r_config: linux_aarch64_r_base4.3 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda - - conda-linux-aarch64-cpu-r42: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_aarch64_cuda_compiler_versionNone - r_config: linux_aarch64_r_base4.2 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r42(h[a-z0-9]+)_0.conda - conda-linux-aarch64-cpu-py3: ci: azure template: conda-recipes/azure.linux.yml @@ -362,26 +316,6 @@ tasks: ############################## Conda OSX (x64) ############################## - conda-osx-x64-cpu-r43: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_64_ - r_config: osx_64_r_base4.3 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda - - conda-osx-x64-cpu-r42: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_64_ - r_config: osx_64_r_base4.2 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r42(h[a-z0-9]+)_0.conda - conda-osx-x64-cpu-py3: ci: azure template: conda-recipes/azure.osx.yml @@ -396,26 +330,6 @@ tasks: ############################# Conda OSX (arm64) ############################# - conda-osx-arm64-cpu-r43: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_arm64_ - r_config: osx_arm64_r_base4.3 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r43(h[a-z0-9]+)_0.conda - - conda-osx-arm64-cpu-r42: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_arm64_ - r_config: osx_arm64_r_base4.2 - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r42(h[a-z0-9]+)_0.conda - conda-osx-arm64-cpu-py3: ci: azure template: conda-recipes/azure.osx.yml @@ -430,18 +344,6 @@ tasks: ############################## Conda Windows ################################ - conda-win-x64-cpu-r41: - ci: azure - template: conda-recipes/azure.win.yml - params: - config: win_64_cuda_compiler_versionNone - r_config: win_64_ - artifacts: - - libarrow-{no_rc_version}-(h[a-z0-9]+)_0_cpu.conda - - r-arrow-{no_rc_version}-r41(h[a-z0-9]+)_0.conda - - # conda-forge does not yet support R 4.2 on windows - conda-win-x64-cpu-py3: ci: azure template: conda-recipes/azure.win.yml From 372f0a063f4b174a1ffab5f1cd037094d8da7a8a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Dec 2023 05:44:43 +0900 Subject: [PATCH 059/570] MINOR: [Java] Bump org.apache.maven.surefire:surefire-junit-platform from 3.2.2 to 3.2.3 in /java (#39280) Bumps org.apache.maven.surefire:surefire-junit-platform from 3.2.2 to 3.2.3. 
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.surefire:surefire-junit-platform&package-manager=maven&previous-version=3.2.2&new-version=3.2.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Dependabot commands and options

    You can trigger Dependabot actions by commenting on this PR:
    - `@ dependabot rebase` will rebase this PR
    - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
    - `@ dependabot merge` will merge this PR after your CI passes on it
    - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
    - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
    - `@ dependabot reopen` will reopen this PR if it is closed
    - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
    - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
    - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
    - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
    - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 86eb428ebd571..ccd8418851b02 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -452,7 +452,7 @@ org.apache.maven.surefire surefire-junit-platform - 3.2.2 + 3.2.3 From 339810bae2e5f22bc6e1d81ab5a21e038e5994fa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Dec 2023 05:46:15 +0900 Subject: [PATCH 060/570] MINOR: [C#] Bump Grpc.Tools from 2.59.0 to 2.60.0 in /csharp (#39283) Bumps [Grpc.Tools](https://github.com/grpc/grpc) from 2.59.0 to 2.60.0.
    [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Grpc.Tools&package-manager=nuget&previous-version=2.59.0&new-version=2.60.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Dependabot commands and options

    You can trigger Dependabot actions by commenting on this PR:
    - `@ dependabot rebase` will rebase this PR
    - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
    - `@ dependabot merge` will merge this PR after your CI passes on it
    - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
    - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
    - `@ dependabot reopen` will reopen this PR if it is closed
    - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
    - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
    - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
    - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
    - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj | 2 +- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj index f04acf3a4c7e2..47b9db2acb155 100644 --- a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj +++ b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 1849bf11b7439..aae26273ac282 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + From 3943f744722031d2706d8ed99c1b06caf5429976 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Dec 2023 06:12:51 +0900 Subject: [PATCH 061/570] MINOR: [Java] Bump org.apache.hadoop:hadoop-client-runtime from 3.3.2 to 3.3.6 in /java (#39278) Bumps org.apache.hadoop:hadoop-client-runtime from 3.3.2 to 3.3.6. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.hadoop:hadoop-client-runtime&package-manager=maven&previous-version=3.3.2&new-version=3.3.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Dependabot commands and options

    You can trigger Dependabot actions by commenting on this PR:
    - `@ dependabot rebase` will rebase this PR
    - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
    - `@ dependabot merge` will merge this PR after your CI passes on it
    - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
    - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
    - `@ dependabot reopen` will reopen this PR if it is closed
    - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
    - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
    - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
    - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
    - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/adapter/orc/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index 72ba13ea81738..803ae5a33826f 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -54,7 +54,7 @@ org.apache.hadoop hadoop-client-runtime - 3.3.2 + 3.3.6 test From 01c461fa8fe668e5750a5e6b96eeb9ef4aedc858 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Dec 2023 08:26:39 +0900 Subject: [PATCH 062/570] MINOR: [Java] Bump io.grpc:grpc-bom from 1.59.0 to 1.60.0 in /java (#39282) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.grpc:grpc-bom](https://github.com/grpc/grpc-java) from 1.59.0 to 1.60.0.
    Release notes

    Sourced from io.grpc:grpc-bom's releases.

    v1.60.0

    API Changes

    • api: Stabilize ForwardingServerBuilder, ForwardingChannelBuilder2, and ForwardingChannelBuilder. Note that ForwardingChannelBuilder is stabilized (no changes will be made to it), but immediately deprecated in favor of ForwardingChannelBuilder2. (#10586)
    • api: Deprecate ForwardingChannelBuilder.delegate(). De facto this deprecates the class itself, since all classes extending ForwardingChannelBuilder implement the delegate() method. See javadoc for details (#10587)
    • api: Changed recently-introduced LoadBalancer.acceptResolvedAddresses() to return Status instead of boolean (#10636). This is part of continued work to align the LB API cross-language and API stabilization
    • stub: Deprecate StreamObservers (#10654)
    • alts: AltsChannelBuilder now extends ForwardingChannelBuilder2 (#10587)
    • protobuf: Stabilize ProtoUtils.metadataMarshaller() (#10628)
    • protobuf-lite: ProtoLiteUtils experimental comment (#10627)

    Behavior Changes

    • core: ManagedChannels now check the address types provided by the nameResolver (for the given target) with the address types supported by the channel transport and generate an error in case of mismatch. That dramatically improves the error message when an issue occurs
• core: When a server stream is closed due to the user's code (an uncaught exception in the halfClosed, messagesAvailable, or onReady callback of a ServerStream's listener), the Status.UNKNOWN returned to the client will have an "Application error processing RPC" description. Previously the description was empty. This is helpful to differentiate between server errors originating in the user application, in the gRPC library, or even those injected by a proxy. (#10643)
    • xds: Log ORCA UNIMPLEMENTED error to subchannel logger. This removes them from the normal application logs, reducing log spam

    Improvements

    • Change the underlying implementations of RingHash, RoundRobin, WeightedRoundRobin and LeastRequest load balancers to utilize the pick first load balancer rather than directly manage subchannels. This should only be noticeable if it introduced a bug
    • core: Avoid flushing headers when the server returns a single response (#9314). This is a performance optimization to reduce the number of packets for non-streaming responses
    • util: Make grpc-core an implementation dependency. This will prevent the io.grpc.internal classes in grpc-core from being visible during compilation when depending on just grpc-util
    • netty: Implement Http2Headers.isEmpty(). This fixes compatibility with Netty 4.1.101.Final.
    • netty: Add NettyServerBuilder.maxRstFramesPerWindow(). This can be used to limit impact of Rapid Reset
    • netty: Disable huffman coding in headers (#10563). Huffman coding provides modest compression for relatively high CPU usage, especially within a data center. Rely just on the HPACK static and dynamic tables for compression, for higher performance. This only impacts header values 512 bytes or longer, as Netty already disabled Huffman for smaller values
    • alts: Improve handshake failure error message by propagating original exception (#10644)

    Bug Fixes

    • util: Remove shutdown subchannels from OD tracking (#10683). This could have caused a memory leak on a long-lived channel. But we don’t think it could be triggered with our built-in load balancing policies.

    Dependencies

    • Bump Netty to 4.1.100.Final

    Acknowledgements

@anthonyjpratti @fedorka @jpd236 @mateusazis @pkoenig10 @yannickepstein @amirhadadi

    v1.59.1

    • netty: Implement Http2Headers.isEmpty(). This fixes compatibility with Netty 4.1.101.Final.
    • netty: Add NettyServerBuilder.maxRstFramesPerWindow(). This can be used to limit impact of Rapid Reset
    • xds: Log ORCA UNIMPLEMENTED error to subchannel logger. This removes them from the normal application logs, reducing log spam
    Commits
    • eb8b1d8 Bump version to 1.60.0
    • 5b1bb8c Update README etc to reference 1.60.0
    • 9400613 all: Add grpc-inprocess
    • 69114bf inprocess: Add missing anonymous address as supported
    • 24b3ca1 core: Detect NameResolverProviders passed as Factories
    • 6c55cd0 util: Remove shutdown subchannels from OD tracking (#10683)
    • 43e98d0 netty: Add option to limit RST_STREAM rate
    • 2b65e66 netty: disable huffman coding in headers (#10563)
    • 90e76a1 Implement Http2Headers.isEmpty (#10663)
    • 0299788 util: Make grpc-core an implementation dependency
    • Additional commits viewable in compare view

    [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.grpc:grpc-bom&package-manager=maven&previous-version=1.59.0&new-version=1.60.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index ccd8418851b02..f6dcfadb81b1e 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,7 +34,7 @@ 2.0.9 32.1.3-jre 4.1.100.Final - 1.59.0 + 1.60.0 3.23.1 2.16.0 2.7.1 From 0552217efa4ba0a1a1a7857a86c92278ecf129c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E5=A4=A9?= Date: Tue, 19 Dec 2023 07:39:02 +0800 Subject: [PATCH 063/570] GH-37199: [C++] Expose a span converter for Buffer and ArraySpan (#38027) ### Rationale for this change Convenience. We can have such a helper at the buffer and array data level. ### What changes are included in this PR? Add `Buffer::span_as`, `Buffer::mutuable_span_as` and `ArraySpan::GetSpan`. ### Are these changes tested? No, but I'm happy to add some test if needed. ### Are there any user-facing changes? Yes, new public functions. * Closes: #37199 Authored-by: jsjtxietian Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/array/data.h | 31 +++++++++++++++++++++++++++++++ cpp/src/arrow/buffer.h | 13 +++++++++++++ 2 files changed, 44 insertions(+) diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 4c2df8381490a..f29f164d19973 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -18,6 +18,7 @@ #pragma once #include // IWYU pragma: export +#include #include #include #include @@ -438,6 +439,36 @@ struct ARROW_EXPORT ArraySpan { return GetValues(i, this->offset); } + /// \brief Access a buffer's data as a span + /// + /// \param i The buffer index + /// \param length The required length (in number of typed values) of the requested span + /// \pre i > 0 + /// \pre length <= the length of the buffer (in number of values) that's expected for + /// this array type + /// \return A span of the requested length + template + util::span GetSpan(int i, int64_t length) const { + const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); + assert(i > 0 && length + offset <= buffer_length); + return util::span(buffers[i].data_as() + this->offset, length); + } + + /// \brief Access a buffer's data as a span + /// + /// \param i The buffer index + /// \param length The required length (in number of typed values) of the requested span + /// \pre i > 0 + /// \pre length <= the length of the buffer (in number of values) that's expected for + /// this array type + /// \return A span of the requested length + template + util::span GetSpan(int i, int64_t length) { + const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); + assert(i > 0 && length + offset <= buffer_length); + return util::span(buffers[i].mutable_data_as() + this->offset, length); + } + inline bool IsNull(int64_t i) const { return !IsValid(i); } inline bool IsValid(int64_t i) const { diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index ae76550be26fc..52fd94ec1f7d4 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -30,6 +30,7 @@ #include "arrow/status.h" #include "arrow/type_fwd.h" #include "arrow/util/macros.h" +#include "arrow/util/span.h" #include "arrow/util/visibility.h" namespace arrow { @@ -233,6 +234,12 @@ class ARROW_EXPORT Buffer { return reinterpret_cast(data()); } + /// \brief Return the buffer's data as a span + template + util::span span_as() const { + return util::span(data_as(), static_cast(size() / sizeof(T))); + } + /// \brief Return a writable pointer to the 
buffer's data /// /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` @@ -260,6 +267,12 @@ class ARROW_EXPORT Buffer { return reinterpret_cast(mutable_data()); } + /// \brief Return the buffer's mutable data as a span + template + util::span mutable_span_as() const { + return util::span(mutable_data_as(), static_cast(size() / sizeof(T))); + } + /// \brief Return the device address of the buffer's data uintptr_t address() const { return reinterpret_cast(data_); } From 659b2311bc577eb4322c5772a5e90f919620bb95 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 19 Dec 2023 07:32:00 +0000 Subject: [PATCH 064/570] GH-39262: [C++][Azure][FS] Add default credential auth configuration (#39263) ### Rationale for this change Default credential is a useful auth option. ### What changes are included in this PR? Implement `AzureOptions::ConfigureDefaultCredential` plus a little bit of plumbing to go around it. Created a simple test. ### Are these changes tested? Added a simple unittest that everything initialises happily. This does not actually test a successful authentication. I think to do a real authentication with Azure we would need to run the test against real blob storage and we would need to create various identities which are non-trivial to create. Personally I think this is ok because all the complexity is abstracted away by the Azure SDK. ### Are there any user-facing changes? * Closes: #39262 Lead-authored-by: Thomas Newton Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 24 ++++++++++++++++++++++-- cpp/src/arrow/filesystem/azurefs.h | 7 +++++++ cpp/src/arrow/filesystem/azurefs_test.cc | 18 ++++++------------ 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 217885364089b..dd267aac36d35 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -18,6 +18,7 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" +#include #include #include @@ -61,6 +62,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const { switch (credential_kind_) { case CredentialKind::kAnonymous: return true; + case CredentialKind::kTokenCredential: + return token_credential_ == other.token_credential_; case CredentialKind::kStorageSharedKeyCredential: return storage_shared_key_credential_->AccountName == other.storage_shared_key_credential_->AccountName; @@ -69,8 +72,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const { return false; } -Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key) { +void AzureOptions::SetUrlsForAccountName(const std::string& account_name) { if (this->backend == AzureBackend::kAzurite) { account_blob_url_ = "http://127.0.0.1:10000/" + account_name + "/"; account_dfs_url_ = "http://127.0.0.1:10000/" + account_name + "/"; @@ -78,6 +80,18 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_na account_dfs_url_ = "https://" + account_name + ".dfs.core.windows.net/"; account_blob_url_ = "https://" + account_name + ".blob.core.windows.net/"; } +} + +Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { + AzureOptions::SetUrlsForAccountName(account_name); + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = std::make_shared(); + return Status::OK(); +} + +Status 
AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, + const std::string& account_key) { + AzureOptions::SetUrlsForAccountName(account_name); credential_kind_ = CredentialKind::kStorageSharedKeyCredential; storage_shared_key_credential_ = std::make_shared(account_name, account_key); @@ -89,6 +103,9 @@ Result> AzureOptions::MakeBlobServiceC switch (credential_kind_) { case CredentialKind::kAnonymous: break; + case CredentialKind::kTokenCredential: + return std::make_unique(account_blob_url_, + token_credential_); case CredentialKind::kStorageSharedKeyCredential: return std::make_unique(account_blob_url_, storage_shared_key_credential_); @@ -101,6 +118,9 @@ AzureOptions::MakeDataLakeServiceClient() const { switch (credential_kind_) { case CredentialKind::kAnonymous: break; + case CredentialKind::kTokenCredential: + return std::make_unique(account_dfs_url_, + token_credential_); case CredentialKind::kStorageSharedKeyCredential: return std::make_unique( account_dfs_url_, storage_shared_key_credential_); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 1266aa2d02b86..b2c7010ff3758 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -70,16 +70,23 @@ struct ARROW_EXPORT AzureOptions { enum class CredentialKind { kAnonymous, + kTokenCredential, kStorageSharedKeyCredential, } credential_kind_ = CredentialKind::kAnonymous; std::shared_ptr storage_shared_key_credential_; + std::shared_ptr token_credential_; + + void SetUrlsForAccountName(const std::string& account_name); + public: AzureOptions(); ~AzureOptions(); + Status ConfigureDefaultCredential(const std::string& account_name); + Status ConfigureAccountKeyCredential(const std::string& account_name, const std::string& account_key); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 463ff4e8daf3d..799f3992a2210 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -43,9 +43,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -266,15 +263,12 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; -// Placeholder tests -// TODO: GH-18014 Remove once a proper test is added -TEST(AzureFileSystem, InitializeCredentials) { - auto default_credential = std::make_shared(); - auto managed_identity_credential = - std::make_shared(); - auto service_principal_credential = - std::make_shared("tenant_id", "client_id", - "client_secret"); +TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { + AzureOptions options; + options.backend = AzureBackend::kAzurite; // Irrelevant for this test because it + // doesn't connect to the server. + ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); + EXPECT_OK_AND_ASSIGN(auto default_credential_fs, AzureFileSystem::Make(options)); } TEST(AzureFileSystem, OptionsCompare) { From f5dd3d4a1c0efb7c8587287da0c536988bcd1559 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 19 Dec 2023 09:45:00 +0100 Subject: [PATCH 065/570] GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276) ### Rationale for this change `S3FileSystem` equals `None` currently causes bus error. ### What changes are included in this PR? Add `not None` to `FileSystem.equals` signature. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* Closes: #38535 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/_fs.pyx | 2 +- python/pyarrow/tests/test_fs.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index ef8db31bfc2f6..395f488144331 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -505,7 +505,7 @@ cdef class FileSystem(_Weakrefable): cdef inline shared_ptr[CFileSystem] unwrap(self) nogil: return self.wrapped - def equals(self, FileSystem other): + def equals(self, FileSystem other not None): """ Parameters ---------- diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 59c9c449429b3..d0fa253e314e9 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -542,6 +542,13 @@ def test_filesystem_equals(): assert SubTreeFileSystem('/base', fs0) != SubTreeFileSystem('/other', fs0) +def test_filesystem_equals_none(fs): + with pytest.raises(TypeError, match="got NoneType"): + fs.equals(None) + + assert fs is not None + + def test_subtree_filesystem(): localfs = LocalFileSystem() From 9cb78addf7fcd662de1579db9dff55bd1a420fe4 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 19 Dec 2023 09:45:41 +0100 Subject: [PATCH 066/570] GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059) ### Rationale for this change `Time32Type` and `Time64Type` unit docs are not correctly documented. ### What changes are included in this PR? Update the docstrings for `Time32Type` and `Time64Type` `unit`. * Closes: #38683 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/types.pxi | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a0ddf09d69423..912ee39f7d712 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1108,6 +1108,9 @@ cdef class Time32Type(DataType): """ Concrete class for time32 data types. + Supported time unit resolutions are 's' [second] + and 'ms' [millisecond]. + Examples -------- Create an instance of time32 type: @@ -1124,7 +1127,7 @@ cdef class Time32Type(DataType): @property def unit(self): """ - The time unit ('s', 'ms', 'us' or 'ns'). + The time unit ('s' or 'ms'). Examples -------- @@ -1140,6 +1143,9 @@ cdef class Time64Type(DataType): """ Concrete class for time64 data types. + Supported time unit resolutions are 'us' [microsecond] + and 'ns' [nanosecond]. + Examples -------- Create an instance of time64 type: @@ -1156,7 +1162,7 @@ cdef class Time64Type(DataType): @property def unit(self): """ - The time unit ('s', 'ms', 'us' or 'ns'). + The time unit ('us' or 'ns'). Examples -------- From 64fed4e047f6a7b6e1081921135afc86fdcef1e7 Mon Sep 17 00:00:00 2001 From: Abram Fleishman Date: Tue, 19 Dec 2023 01:47:47 -0800 Subject: [PATCH 067/570] GH-39191: [R] throw error when `string_replace` is passed vector of values in `pattern` (#39219) ### Rationale for this change See #39191 This PR will hopefully throw an informative error message to let the user know that while the stringr::str_replace_all function can handle a named vector of values as the pattern argument, the arrow R package implementation cannot. ### What changes are included in this PR? - [ ] add tests for passing vector to the pattern argument - [ ] add check for length > 1 to the string replace bindings ### Are these changes tested? yes (though I need help!) ### Are there any user-facing changes? yes. 
Hopefully the user will be alerted by an informative error message that they cannot pass a vector to the pattern argument. No breaking changes are expected. * Closes: #39191 Authored-by: Abram B. Fleishman Signed-off-by: Nic Crane --- r/R/dplyr-funcs-string.R | 7 ++++++- r/tests/testthat/test-dplyr-funcs-string.R | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 9f3220e557f08..a21ce78edd189 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -58,7 +58,6 @@ get_stringr_pattern_options <- function(pattern) { } ensure_opts <- function(opts) { - # default options for the simple cases if (is.character(opts)) { opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE) @@ -352,6 +351,12 @@ register_bindings_string_regex <- function() { # Encapsulate some common logic for sub/gsub/str_replace/str_replace_all arrow_r_string_replace_function <- function(max_replacements) { function(pattern, replacement, x, ignore.case = FALSE, fixed = FALSE) { + if (length(pattern) != 1) { + stop("`pattern` must be a length 1 character vector") + } + if (length(replacement) != 1) { + stop("`replacement` must be a length 1 character vector") + } Expression$create( ifelse(fixed && !ignore.case, "replace_substring", "replace_substring_regex"), x, diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R index 411b5ae3c738a..039220b88ee00 100644 --- a/r/tests/testthat/test-dplyr-funcs-string.R +++ b/r/tests/testthat/test-dplyr-funcs-string.R @@ -425,6 +425,23 @@ test_that("sub and gsub with namespacing", { }) test_that("str_replace and str_replace_all", { + x <- Expression$field_ref("x") + + expect_error( + call_binding("str_replace_all", x, c("F" = "_", "b" = "")), + regexp = "`pattern` must be a length 1 character vector" + ) + + expect_error( + call_binding("str_replace_all", x, c("F", "b"), c("_", "")), + regexp = "`pattern` must be a length 1 character vector" + ) + + expect_error( + call_binding("str_replace_all", x, c("F"), c("_", "")), + regexp = "`replacement` must be a length 1 character vector" + ) + df <- tibble(x = c("Foo", "bar")) compare_dplyr_binding( From 419bbc4ff6a5a14af18e5d7ca3ca2de41a413bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 19 Dec 2023 12:08:38 +0100 Subject: [PATCH 068/570] MINOR: [Release] Update versions for 15.0.0-SNAPSHOT --- ci/scripts/PKGBUILD | 2 +- r/DESCRIPTION | 2 +- r/NEWS.md | 4 +++- r/pkgdown/assets/versions.json | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 2cdd1d42634bf..674acc99f54a9 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=14.0.1.9000 +pkgver=14.0.2.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 1bf25e57a3cce..b290a75f932d5 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 14.0.1.9000 +Version: 14.0.2.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index 8515facdff871..63f12607d8d1b 100644 --- a/r/NEWS.md +++ 
b/r/NEWS.md @@ -17,7 +17,9 @@ under the License. --> -# arrow 14.0.1.9000 +# arrow 14.0.2.9000 + +# arrow 14.0.2 # arrow 14.0.0.2 diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 88289e72004b3..35a1ef3b5ecb3 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,10 +1,10 @@ [ { - "name": "14.0.1.9000 (dev)", + "name": "14.0.2.9000 (dev)", "version": "dev/" }, { - "name": "14.0.1 (release)", + "name": "14.0.2 (release)", "version": "" }, { From 0479f8532d5ace54cf554e2e60aa621f06536091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 19 Dec 2023 12:08:38 +0100 Subject: [PATCH 069/570] MINOR: [Release] Update .deb/.rpm changelogs for 14.0.2 --- .../linux-packages/apache-arrow-apt-source/debian/changelog | 6 ++++++ .../apache-arrow-release/yum/apache-arrow-release.spec.in | 3 +++ dev/tasks/linux-packages/apache-arrow/debian/changelog | 6 ++++++ dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++ 4 files changed, 18 insertions(+) diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 83a388c93051d..32a5a38afebf3 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (14.0.2-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 12 Dec 2023 09:31:43 -0000 + apache-arrow-apt-source (14.0.1-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 245e8afeaeb1d..348f8064ecc5f 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Tue Dec 12 2023 Raúl Cumplido - 14.0.2-1 +- New upstream release. + * Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 1f3f1bd5abd07..b14bb0985893e 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (14.0.2-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 12 Dec 2023 09:31:43 -0000 + apache-arrow (14.0.1-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 87e05558e8cda..44421ce0ea1e4 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -864,6 +864,9 @@ Documentation for Apache Parquet GLib. %{_datadir}/gtk-doc/html/parquet-glib/ %changelog +* Tue Dec 12 2023 Raúl Cumplido - 14.0.2-1 +- New upstream release. + * Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 - New upstream release. 
From 56991d3efd57e610f5ab604086e19753bd8c834b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 19 Dec 2023 10:59:50 -0300 Subject: [PATCH 070/570] GH-39292 [C++][FS]: Remove the AzureBackend enum and add more flexible connection options (#39293) ### Rationale for this change It's good to avoid mentioning the specific test environment in the implementation code. ### What changes are included in this PR? - Removal of the enum - Removal of the `AzureOptions::backend` class member - Addition of more options to `AzureOptions` - Removal of some private string members of `AzureOptions` -- the URLs are built on-the-fly when the clients are instantiated now ### Are these changes tested? Yes. ### Are there any user-facing changes? Changes to the public interface (`azurefs.h`) that won't affect users because the `AzureFS` implementation is not used yet. * Closes: #39292 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 61 ++++++++++++++++-------- cpp/src/arrow/filesystem/azurefs.h | 51 +++++++++++++------- cpp/src/arrow/filesystem/azurefs_test.cc | 21 ++++++-- 3 files changed, 91 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index dd267aac36d35..1aa3e86a6f926 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -51,10 +51,12 @@ AzureOptions::~AzureOptions() = default; bool AzureOptions::Equals(const AzureOptions& other) const { // TODO(GH-38598): update here when more auth methods are added. - const bool equals = backend == other.backend && + const bool equals = blob_storage_authority == other.blob_storage_authority && + dfs_storage_authority == other.dfs_storage_authority && + blob_storage_scheme == other.blob_storage_scheme && + dfs_storage_scheme == other.dfs_storage_scheme && default_metadata == other.default_metadata && - account_blob_url_ == other.account_blob_url_ && - account_dfs_url_ == other.account_dfs_url_ && + account_name_ == other.account_name_ && credential_kind_ == other.credential_kind_; if (!equals) { return false; @@ -72,42 +74,59 @@ bool AzureOptions::Equals(const AzureOptions& other) const { return false; } -void AzureOptions::SetUrlsForAccountName(const std::string& account_name) { - if (this->backend == AzureBackend::kAzurite) { - account_blob_url_ = "http://127.0.0.1:10000/" + account_name + "/"; - account_dfs_url_ = "http://127.0.0.1:10000/" + account_name + "/"; - } else { - account_dfs_url_ = "https://" + account_name + ".dfs.core.windows.net/"; - account_blob_url_ = "https://" + account_name + ".blob.core.windows.net/"; +namespace { +std::string BuildBaseUrl(const std::string& scheme, const std::string& authority, + const std::string& account_name) { + std::string url; + url += scheme + "://"; + if (!authority.empty()) { + if (authority[0] == '.') { + url += account_name; + url += authority; + } else { + url += authority; + url += "/"; + url += account_name; + } } + url += "/"; + return url; } +} // namespace -Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { - AzureOptions::SetUrlsForAccountName(account_name); - credential_kind_ = CredentialKind::kTokenCredential; - token_credential_ = std::make_shared(); - return Status::OK(); +std::string AzureOptions::AccountBlobUrl(const std::string& account_name) const { + return BuildBaseUrl(blob_storage_scheme, blob_storage_authority, account_name); +} + +std::string AzureOptions::AccountDfsUrl(const 
std::string& account_name) const { + return BuildBaseUrl(dfs_storage_scheme, dfs_storage_authority, account_name); } Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, const std::string& account_key) { - AzureOptions::SetUrlsForAccountName(account_name); credential_kind_ = CredentialKind::kStorageSharedKeyCredential; + account_name_ = account_name; storage_shared_key_credential_ = std::make_shared(account_name, account_key); return Status::OK(); } +Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = std::make_shared(); + return Status::OK(); +} + Result> AzureOptions::MakeBlobServiceClient() const { switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: - return std::make_unique(account_blob_url_, + return std::make_unique(AccountBlobUrl(account_name_), token_credential_); case CredentialKind::kStorageSharedKeyCredential: - return std::make_unique(account_blob_url_, + return std::make_unique(AccountBlobUrl(account_name_), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); @@ -119,11 +138,11 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: - return std::make_unique(account_dfs_url_, - token_credential_); + return std::make_unique( + AccountDfsUrl(account_name_), token_credential_); case CredentialKind::kStorageSharedKeyCredential: return std::make_unique( - account_dfs_url_, storage_shared_key_credential_); + AccountDfsUrl(account_name_), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index b2c7010ff3758..35c140b1097c7 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -43,17 +43,37 @@ class DataLakeServiceClient; namespace arrow::fs { -enum class AzureBackend { - /// \brief Official Azure Remote Backend - kAzure, - /// \brief Local Simulated Storage - kAzurite -}; - /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { - /// \brief The backend to connect to: Azure or Azurite (for testing). - AzureBackend backend = AzureBackend::kAzure; + /// \brief hostname[:port] of the Azure Blob Storage Service. + /// + /// If the hostname is a relative domain name (one that starts with a '.'), then storage + /// account URLs will be constructed by prepending the account name to the hostname. + /// If the hostname is a fully qualified domain name, then the hostname will be used + /// as-is and the account name will follow the hostname in the URL path. + /// + /// Default: ".blob.core.windows.net" + std::string blob_storage_authority = ".blob.core.windows.net"; + + /// \brief hostname[:port] of the Azure Data Lake Storage Gen 2 Service. + /// + /// If the hostname is a relative domain name (one that starts with a '.'), then storage + /// account URLs will be constructed by prepending the account name to the hostname. + /// If the hostname is a fully qualified domain name, then the hostname will be used + /// as-is and the account name will follow the hostname in the URL path. + /// + /// Default: ".dfs.core.windows.net" + std::string dfs_storage_authority = ".dfs.core.windows.net"; + + /// \brief Azure Blob Storage connection transport. 
+ /// + /// Default: "https" + std::string blob_storage_scheme = "https"; + + /// \brief Azure Data Lake Storage Gen 2 connection transport. + /// + /// Default: "https" + std::string dfs_storage_scheme = "https"; // TODO(GH-38598): Add support for more auth methods. // std::string connection_string; @@ -65,22 +85,17 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr default_metadata; private: - std::string account_blob_url_; - std::string account_dfs_url_; - enum class CredentialKind { kAnonymous, kTokenCredential, kStorageSharedKeyCredential, } credential_kind_ = CredentialKind::kAnonymous; + std::string account_name_; + std::shared_ptr token_credential_; std::shared_ptr storage_shared_key_credential_; - std::shared_ptr token_credential_; - - void SetUrlsForAccountName(const std::string& account_name); - public: AzureOptions(); ~AzureOptions(); @@ -92,8 +107,8 @@ struct ARROW_EXPORT AzureOptions { bool Equals(const AzureOptions& other) const; - const std::string& AccountBlobUrl() const { return account_blob_url_; } - const std::string& AccountDfsUrl() const { return account_dfs_url_; } + std::string AccountBlobUrl(const std::string& account_name) const; + std::string AccountDfsUrl(const std::string& account_name) const; Result> MakeBlobServiceClient() const; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 799f3992a2210..8a39c4c554897 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -73,6 +73,13 @@ namespace Blobs = Azure::Storage::Blobs; namespace Core = Azure::Core; namespace DataLake = Azure::Storage::Files::DataLake; +enum class AzureBackend { + /// \brief Official Azure Remote Backend + kAzure, + /// \brief Local Simulated Storage + kAzurite +}; + class BaseAzureEnv : public ::testing::Environment { protected: std::string account_name_; @@ -265,8 +272,6 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; - options.backend = AzureBackend::kAzurite; // Irrelevant for this test because it - // doesn't connect to the server. ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); EXPECT_OK_AND_ASSIGN(auto default_credential_fs, AzureFileSystem::Make(options)); } @@ -352,7 +357,17 @@ class TestAzureFileSystem : public ::testing::Test { static Result MakeOptions(BaseAzureEnv* env) { AzureOptions options; - options.backend = env->backend(); + switch (env->backend()) { + case AzureBackend::kAzurite: + options.blob_storage_authority = "127.0.0.1:10000"; + options.dfs_storage_authority = "127.0.0.1:10000"; + options.blob_storage_scheme = "http"; + options.dfs_storage_scheme = "http"; + break; + case AzureBackend::kAzure: + // Use the default values + break; + } ARROW_EXPECT_OK( options.ConfigureAccountKeyCredential(env->account_name(), env->account_key())); return options; From b862b164a644a92f8a802954fcad179bf28e020e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JB=20Onofr=C3=A9?= Date: Tue, 19 Dec 2023 17:41:28 +0100 Subject: [PATCH 071/570] GH-39299: [Java] Upgrade to Avro 1.11.3 (#39300) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Upgrade to Avro 1.11.3 to fix CVE-2023-39410 ### What changes are included in this PR? Upgrade to Avro 1.11.3 ### Are these changes tested? Run local tests especially on Avro adapter ### Are there any user-facing changes? 
Not directly * Closes: #39299 Authored-by: JB Onofré Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index f6dcfadb81b1e..75e0946f10811 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -39,7 +39,7 @@ 2.16.0 2.7.1 23.5.26 - 1.10.0 + 1.11.3 2 true From 3e182f2c9d5c710ce809e342f5c8cee547f979d2 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 19 Dec 2023 13:34:53 -0500 Subject: [PATCH 072/570] GH-39013: [Go][Integration] Support cABI import/export of StringView (#39019) ### Rationale for this change The Go implementation should support import/export of the new data types. This will enable integration testing between the C++ and Go implementations. ### What changes are included in this PR? Added import/export for the new data types and arrays of data of those types. ### Are these changes tested? Yes, they will be covered by the integration tests and existing Go unit tests. ### Are there any user-facing changes? This is a user facing change * Closes: #39013 Lead-authored-by: Benjamin Kietzman Co-authored-by: Matt Topol Co-authored-by: Felipe Oliveira Carvalho Signed-off-by: Matt Topol --- dev/archery/archery/integration/datagen.py | 93 +++++++++- dev/archery/archery/integration/runner.py | 2 + docs/source/format/Integration.rst | 23 ++- go/arrow/array/encoded.go | 6 +- go/arrow/array/list.go | 146 ++-------------- go/arrow/avro/reader_types.go | 2 +- go/arrow/bitutil/endian_default.go | 1 + go/arrow/bitutil/endian_s390x.go | 2 +- go/arrow/cdata/cdata.go | 90 ++++++++++ go/arrow/cdata/cdata_exports.go | 54 ++++-- go/arrow/compute/arithmetic.go | 4 +- go/arrow/compute/arithmetic_test.go | 36 ++-- go/arrow/compute/exec/utils.go | 140 +-------------- go/arrow/compute/exec/utils_test.go | 2 +- go/arrow/compute/fieldref.go | 51 +++--- .../internal/kernels/base_arithmetic.go | 12 +- .../internal/kernels/base_arithmetic_amd64.go | 55 +++--- .../kernels/basic_arithmetic_noasm.go | 3 +- .../compute/internal/kernels/boolean_cast.go | 2 +- go/arrow/compute/internal/kernels/helpers.go | 42 ++--- .../compute/internal/kernels/numeric_cast.go | 22 +-- .../internal/kernels/scalar_arithmetic.go | 6 +- .../kernels/scalar_comparison_amd64.go | 5 +- .../kernels/scalar_comparison_noasm.go | 4 +- .../internal/kernels/scalar_comparisons.go | 30 ++-- .../compute/internal/kernels/string_casts.go | 10 +- .../compute/internal/kernels/vector_hash.go | 2 +- .../internal/kernels/vector_run_end_encode.go | 24 +-- .../internal/kernels/vector_selection.go | 28 +-- go/arrow/compute/scalar_compare_test.go | 16 +- go/arrow/compute/vector_hash_test.go | 23 ++- go/arrow/compute/vector_selection_test.go | 30 ++-- go/arrow/flight/doc.go | 1 - go/arrow/flight/server.go | 2 +- go/arrow/internal/arrjson/arrjson.go | 13 +- go/arrow/internal/arrjson/arrjson_test.go | 32 ++-- go/arrow/internal/testing/tools/bits.go | 2 +- go/arrow/internal/utils.go | 12 ++ go/arrow/ipc/file_reader.go | 30 ++-- go/arrow/ipc/writer.go | 158 +++++++++-------- go/arrow/memory/util.go | 8 + go/arrow/type_traits.go | 162 ++++++++++++++++++ go/arrow/type_traits_decimal128.go | 9 +- go/arrow/type_traits_decimal256.go | 9 +- go/arrow/type_traits_float16.go | 9 +- go/arrow/type_traits_interval.go | 25 +-- go/arrow/type_traits_numeric.gen.go | 121 ++++--------- go/arrow/type_traits_numeric.gen.go.tmpl | 9 +- go/arrow/type_traits_timestamp.go | 9 +- go/arrow/type_traits_view.go | 9 +- go/internal/bitutils/bit_set_run_reader.go | 6 +- go/internal/utils/math.go | 22 +-- 
go/parquet/file/column_reader.go | 2 +- go/parquet/file/column_reader_test.go | 2 +- go/parquet/file/level_conversion.go | 2 +- .../internal/encoding/boolean_decoder.go | 6 +- .../internal/encoding/byte_array_decoder.go | 2 +- go/parquet/internal/encoding/decoder.go | 2 +- .../internal/encoding/delta_bit_packing.go | 4 +- .../internal/encoding/delta_byte_array.go | 2 +- .../encoding/delta_length_byte_array.go | 2 +- .../encoding/fixed_len_byte_array_decoder.go | 2 +- .../encoding/plain_encoder_types.gen.go | 10 +- .../encoding/plain_encoder_types.gen.go.tmpl | 2 +- .../internal/encoding/typed_encoder.gen.go | 28 +-- .../encoding/typed_encoder.gen.go.tmpl | 4 +- go/parquet/internal/encoding/types.go | 4 +- go/parquet/internal/testutils/pagebuilder.go | 8 +- go/parquet/internal/utils/bit_reader.go | 4 +- go/parquet/internal/utils/rle.go | 6 +- .../internal/utils/typed_rle_dict.gen.go | 56 +++--- .../internal/utils/typed_rle_dict.gen.go.tmpl | 8 +- go/parquet/pqarrow/column_readers.go | 6 +- 73 files changed, 917 insertions(+), 859 deletions(-) create mode 100644 go/arrow/type_traits.go diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 29b203ae130c6..2bbc843836af9 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -927,6 +927,83 @@ class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin): pass +class ListViewField(Field): + + def __init__(self, name, value_field, *, nullable=True, + metadata=None): + super().__init__(name, nullable=nullable, + metadata=metadata) + self.value_field = value_field + + @property + def column_class(self): + return ListViewColumn + + def _get_type(self): + return OrderedDict([ + ('name', 'listview') + ]) + + def _get_children(self): + return [self.value_field.get_json()] + + def generate_column(self, size, name=None): + MAX_LIST_SIZE = 4 + VALUES_SIZE = size * MAX_LIST_SIZE + + is_valid = self._make_is_valid(size) + + MAX_OFFSET = VALUES_SIZE - MAX_LIST_SIZE + offsets = np.random.randint(0, MAX_OFFSET + 1, size=size) + sizes = np.random.randint(0, MAX_LIST_SIZE + 1, size=size) + + values = self.value_field.generate_column(VALUES_SIZE) + + if name is None: + name = self.name + return self.column_class(name, size, is_valid, offsets, sizes, values) + + +class LargeListViewField(ListViewField): + + @property + def column_class(self): + return LargeListViewColumn + + def _get_type(self): + return OrderedDict([ + ('name', 'largelistview') + ]) + + +class _BaseListViewColumn(Column): + + def __init__(self, name, count, is_valid, offsets, sizes, values): + super().__init__(name, count) + self.is_valid = is_valid + self.offsets = offsets + self.sizes = sizes + self.values = values + + def _get_buffers(self): + return [ + ('VALIDITY', [int(v) for v in self.is_valid]), + ('OFFSET', self._encode_offsets(self.offsets)), + ('SIZE', self._encode_offsets(self.sizes)), + ] + + def _get_children(self): + return [self.values.get_json()] + + +class ListViewColumn(_BaseListViewColumn, _NarrowOffsetsMixin): + pass + + +class LargeListViewColumn(_BaseListViewColumn, _LargeOffsetsMixin): + pass + + class MapField(Field): def __init__(self, name, key_field, item_field, *, nullable=True, @@ -1663,6 +1740,15 @@ def generate_binary_view_case(): return _generate_file("binary_view", fields, batch_sizes) +def generate_list_view_case(): + fields = [ + ListViewField('lv', get_field('item', 'float32')), + LargeListViewField('llv', get_field('item', 'float32')), + ] + batch_sizes = [0, 
7, 256] + return _generate_file("list_view", fields, batch_sizes) + + def generate_nested_large_offsets_case(): fields = [ LargeListField('large_list_nullable', get_field('item', 'int32')), @@ -1847,7 +1933,12 @@ def _temp_path(): generate_binary_view_case() .skip_tester('C#') - .skip_tester('Go') + .skip_tester('Java') + .skip_tester('JS') + .skip_tester('Rust'), + + generate_list_view_case() + .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index bab00e6d70d4a..7fadb7e47cf93 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -193,6 +193,8 @@ def _run_test_cases(self, ``case_runner`` ran against ``test_cases`` """ def case_wrapper(test_case): + if serial: + return case_runner(test_case) with printer.cork(): return case_runner(test_case) diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index e1160b287e77c..1a9b1b97f07ee 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -223,7 +223,7 @@ considered equivalent to ``[]`` (no metadata). Duplicated keys are not forbidden **Type**: :: { - "name" : "null|struct|list|largelist|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map" + "name" : "null|struct|list|largelist|listview|largelistview|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|utf8view|binaryview|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map|runendencoded" } A ``Type`` will have other fields as defined in @@ -446,12 +446,22 @@ or ``DATA``. ``BufferData`` is encoded based on the type of buffer: -* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable +* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable ``Field`` still has a ``VALIDITY`` array, even though all values are 1. * ``OFFSET``: a JSON array of integers for 32-bit offsets or - string-formatted integers for 64-bit offsets -* ``TYPE_ID``: a JSON array of integers -* ``DATA``: a JSON array of encoded values + string-formatted integers for 64-bit offsets. +* ``TYPE_ID``: a JSON array of integers. +* ``DATA``: a JSON array of encoded values. +* ``VARIADIC_DATA_BUFFERS``: a JSON array of data buffers represented as + hex encoded strings. +* ``VIEWS``: a JSON array of encoded views, which are JSON objects with: + * ``SIZE``: an integer indicating the size of the view, + * ``INLINED``: an encoded value (this field will be present if ``SIZE`` + is smaller than 12, otherwise the next three fields will be present), + * ``PREFIX_HEX``: the first four bytes of the view encoded as hex, + * ``BUFFER_INDEX``: the index in ``VARIADIC_DATA_BUFFERS`` of the buffer + viewed, + * ``OFFSET``: the offset in the buffer viewed. 
The value encoding for ``DATA`` is different depending on the logical type: @@ -527,6 +537,9 @@ in ``datagen.py``): - Signed indices - Unsigned indices - Nested dictionaries +* Run end encoded +* Binary view and string view +* List view and large list view * Extension Types diff --git a/go/arrow/array/encoded.go b/go/arrow/array/encoded.go index fa5fa7addf34c..8ca1416b92ab3 100644 --- a/go/arrow/array/encoded.go +++ b/go/arrow/array/encoded.go @@ -150,19 +150,19 @@ func (r *RunEndEncoded) LogicalRunEndsArray(mem memory.Allocator) arrow.Array { case *Int16: for _, v := range e.Int16Values()[physOffset : physOffset+physLength] { v -= int16(r.data.offset) - v = int16(utils.MinInt(int(v), r.data.length)) + v = int16(utils.Min(int(v), r.data.length)) bldr.(*Int16Builder).Append(v) } case *Int32: for _, v := range e.Int32Values()[physOffset : physOffset+physLength] { v -= int32(r.data.offset) - v = int32(utils.MinInt(int(v), r.data.length)) + v = int32(utils.Min(int(v), r.data.length)) bldr.(*Int32Builder).Append(v) } case *Int64: for _, v := range e.Int64Values()[physOffset : physOffset+physLength] { v -= int64(r.data.offset) - v = int64(utils.MinInt(int(v), r.data.length)) + v = int64(utils.Min(int(v), r.data.length)) bldr.(*Int64Builder).Append(v) } } diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 4b62734116797..9d959b5e43b78 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -19,7 +19,6 @@ package array import ( "bytes" "fmt" - "math" "strings" "sync/atomic" @@ -1411,118 +1410,19 @@ func (b *baseListViewBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } -// Pre-conditions: -// -// input.DataType() is ListViewType -// input.Len() > 0 && input.NullN() != input.Len() -func minListViewOffset32(input arrow.ArrayData) int32 { - var bitmap []byte - if input.Buffers()[0] != nil { - bitmap = input.Buffers()[0].Bytes() - } - offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] - sizes := arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():] - - isNull := func(i int) bool { - return bitmap != nil && bitutil.BitIsNotSet(bitmap, input.Offset()+i) - } - - // It's very likely that the first non-null non-empty list-view starts at - // offset 0 of the child array. - i := 0 - for i < input.Len() && (isNull(i) || sizes[i] == 0) { - i += 1 - } - if i >= input.Len() { - return 0 - } - minOffset := offsets[i] - if minOffset == 0 { - // early exit: offset 0 found already - return 0 - } - - // Slow path: scan the buffers entirely. - i += 1 - for ; i < input.Len(); i += 1 { - if isNull(i) { - continue - } - offset := offsets[i] - if offset < minOffset && sizes[i] > 0 { - minOffset = offset - } - } - return minOffset -} - -// Find the maximum offset+size in a LIST_VIEW array. +// Find the minimum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array. 
// // Pre-conditions: // -// input.DataType() is ListViewType -// input.Len() > 0 && input.NullN() != input.Len() -func maxListViewOffset32(input arrow.ArrayData) int { - inputOffset := input.Offset() - var bitmap []byte - if input.Buffers()[0] != nil { - bitmap = input.Buffers()[0].Bytes() - } - offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[inputOffset:] - sizes := arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[inputOffset:] - - isNull := func(i int) bool { - return bitmap != nil && bitutil.BitIsNotSet(bitmap, inputOffset+i) - } - - i := input.Len() - 1 // safe because input.Len() > 0 - for i != 0 && (isNull(i) || sizes[i] == 0) { - i -= 1 - } - offset := offsets[i] - size := sizes[i] - if i == 0 { - if isNull(i) || sizes[i] == 0 { - return 0 - } else { - return int(offset + size) - } - } - - values := input.Children()[0] - maxEnd := int(offsets[i] + sizes[i]) - if maxEnd == values.Len() { - // Early-exit: maximum possible view-end found already. - return maxEnd - } - - // Slow path: scan the buffers entirely. - for ; i >= 0; i -= 1 { - offset := offsets[i] - size := sizes[i] - if size > 0 && !isNull(i) { - if int(offset+size) > maxEnd { - maxEnd = int(offset + size) - if maxEnd == values.Len() { - return maxEnd - } - } - } - } - return maxEnd -} - -// Pre-conditions: -// -// input.DataType() is LargeListViewType +// input.DataType() is ListViewType if Offset=int32 or LargeListViewType if Offset=int64 // input.Len() > 0 && input.NullN() != input.Len() -func minLargeListViewOffset64(input arrow.ArrayData) int64 { +func minListViewOffset[Offset int32 | int64](input arrow.ArrayData) Offset { var bitmap []byte if input.Buffers()[0] != nil { bitmap = input.Buffers()[0].Bytes() } - offsets := arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] - sizes := arrow.Int64Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():] + offsets := arrow.GetData[Offset](input.Buffers()[1].Bytes())[input.Offset():] + sizes := arrow.GetData[Offset](input.Buffers()[2].Bytes())[input.Offset():] isNull := func(i int) bool { return bitmap != nil && bitutil.BitIsNotSet(bitmap, input.Offset()+i) @@ -1557,27 +1457,25 @@ func minLargeListViewOffset64(input arrow.ArrayData) int64 { return minOffset } -// Find the maximum offset+size in a LARGE_LIST_VIEW array. +// Find the maximum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array. // // Pre-conditions: // -// input.DataType() is LargeListViewType +// input.DataType() is ListViewType if Offset=int32 or LargeListViewType if Offset=int64 // input.Len() > 0 && input.NullN() != input.Len() -func maxLargeListViewOffset64(input arrow.ArrayData) int64 { +func maxListViewEnd[Offset int32 | int64](input arrow.ArrayData) Offset { inputOffset := input.Offset() var bitmap []byte if input.Buffers()[0] != nil { bitmap = input.Buffers()[0].Bytes() } - offsets := arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[inputOffset:] - sizes := arrow.Int64Traits.CastFromBytes(input.Buffers()[2].Bytes())[inputOffset:] + offsets := arrow.GetData[Offset](input.Buffers()[1].Bytes())[inputOffset:] + sizes := arrow.GetData[Offset](input.Buffers()[2].Bytes())[inputOffset:] isNull := func(i int) bool { return bitmap != nil && bitutil.BitIsNotSet(bitmap, inputOffset+i) } - // It's very likely that the first non-null non-empty list-view starts at - // offset zero, so we check that first and potentially early-return a 0. 
i := input.Len() - 1 // safe because input.Len() > 0 for i != 0 && (isNull(i) || sizes[i] == 0) { i -= 1 @@ -1592,15 +1490,9 @@ func maxLargeListViewOffset64(input arrow.ArrayData) int64 { } } - if offset > math.MaxInt64-size { - // Early-exit: 64-bit overflow detected. This is not possible on a - // valid list-view, but we return the maximum possible value to - // avoid undefined behavior. - return math.MaxInt64 - } values := input.Children()[0] maxEnd := offsets[i] + sizes[i] - if maxEnd == int64(values.Len()) { + if maxEnd == Offset(values.Len()) { // Early-exit: maximum possible view-end found already. return maxEnd } @@ -1611,14 +1503,8 @@ func maxLargeListViewOffset64(input arrow.ArrayData) int64 { size := sizes[i] if size > 0 && !isNull(i) { if offset+size > maxEnd { - if offset > math.MaxInt64-size { - // 64-bit overflow detected. This is not possible on a valid list-view, - // but we saturate maxEnd to the maximum possible value to avoid - // undefined behavior. - return math.MaxInt64 - } maxEnd = offset + size - if maxEnd == int64(values.Len()) { + if maxEnd == Offset(values.Len()) { return maxEnd } } @@ -1634,11 +1520,11 @@ func rangeOfValuesUsed(input arrow.ArrayData) (int, int) { var minOffset, maxEnd int switch input.DataType().(type) { case *arrow.ListViewType: - minOffset = int(minListViewOffset32(input)) - maxEnd = maxListViewOffset32(input) + minOffset = int(minListViewOffset[int32](input)) + maxEnd = int(maxListViewEnd[int32](input)) case *arrow.LargeListViewType: - minOffset = int(minLargeListViewOffset64(input)) - maxEnd = int(maxLargeListViewOffset64(input)) + minOffset = int(minListViewOffset[int64](input)) + maxEnd = int(maxListViewEnd[int64](input)) case *arrow.ListType: offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] minOffset = int(offsets[0]) diff --git a/go/arrow/avro/reader_types.go b/go/arrow/avro/reader_types.go index 5658c6e587db2..974fea1f14e5a 100644 --- a/go/arrow/avro/reader_types.go +++ b/go/arrow/avro/reader_types.go @@ -22,7 +22,7 @@ import ( "errors" "fmt" "math/big" - + "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/array" "github.com/apache/arrow/go/v15/arrow/decimal128" diff --git a/go/arrow/bitutil/endian_default.go b/go/arrow/bitutil/endian_default.go index 9f5d3cdc7d256..ecbbaa70d04b6 100644 --- a/go/arrow/bitutil/endian_default.go +++ b/go/arrow/bitutil/endian_default.go @@ -14,6 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build !s390x // +build !s390x package bitutil diff --git a/go/arrow/bitutil/endian_s390x.go b/go/arrow/bitutil/endian_s390x.go index a9bba4391280e..e99605f5848fa 100644 --- a/go/arrow/bitutil/endian_s390x.go +++ b/go/arrow/bitutil/endian_s390x.go @@ -18,7 +18,7 @@ package bitutil import ( "math/bits" - "unsafe" + "unsafe" ) var toFromLEFunc = bits.ReverseBytes64 diff --git a/go/arrow/cdata/cdata.go b/go/arrow/cdata/cdata.go index f9693851d7483..64cc8456e8153 100644 --- a/go/arrow/cdata/cdata.go +++ b/go/arrow/cdata/cdata.go @@ -82,6 +82,8 @@ var formatToSimpleType = map[string]arrow.DataType{ "Z": arrow.BinaryTypes.LargeBinary, "u": arrow.BinaryTypes.String, "U": arrow.BinaryTypes.LargeString, + "vz": arrow.BinaryTypes.BinaryView, + "vu": arrow.BinaryTypes.StringView, "tdD": arrow.FixedWidthTypes.Date32, "tdm": arrow.FixedWidthTypes.Date64, "tts": arrow.FixedWidthTypes.Time32s, @@ -263,6 +265,12 @@ func importSchema(schema *CArrowSchema) (ret arrow.Field, err error) { dt = arrow.ListOfField(childFields[0]) case 'L': // large list dt = arrow.LargeListOfField(childFields[0]) + case 'v': // list view/large list view + if f[2] == 'l' { + dt = arrow.ListViewOfField(childFields[0]) + } else if f[2] == 'L' { + dt = arrow.LargeListViewOfField(childFields[0]) + } case 'w': // fixed size list is w:# where # is the list size. listSize, err := strconv.Atoi(strings.Split(f, ":")[1]) if err != nil { @@ -364,6 +372,16 @@ func (imp *cimporter) doImportChildren() error { if err := imp.children[0].importChild(imp, children[0]); err != nil { return err } + case arrow.LIST_VIEW: // only one child to import + imp.children[0].dt = imp.dt.(*arrow.ListViewType).Elem() + if err := imp.children[0].importChild(imp, children[0]); err != nil { + return err + } + case arrow.LARGE_LIST_VIEW: // only one child to import + imp.children[0].dt = imp.dt.(*arrow.LargeListViewType).Elem() + if err := imp.children[0].importChild(imp, children[0]); err != nil { + return err + } case arrow.FIXED_SIZE_LIST: // only one child to import imp.children[0].dt = imp.dt.(*arrow.FixedSizeListType).Elem() if err := imp.children[0].importChild(imp, children[0]); err != nil { @@ -485,10 +503,18 @@ func (imp *cimporter) doImport() error { return imp.importStringLike(int64(arrow.Int64SizeBytes)) case *arrow.LargeBinaryType: return imp.importStringLike(int64(arrow.Int64SizeBytes)) + case *arrow.StringViewType: + return imp.importBinaryViewLike() + case *arrow.BinaryViewType: + return imp.importBinaryViewLike() case *arrow.ListType: return imp.importListLike() case *arrow.LargeListType: return imp.importListLike() + case *arrow.ListViewType: + return imp.importListViewLike() + case *arrow.LargeListViewType: + return imp.importListViewLike() case *arrow.MapType: return imp.importListLike() case *arrow.FixedSizeListType: @@ -654,6 +680,33 @@ func (imp *cimporter) importStringLike(offsetByteWidth int64) (err error) { return } +func (imp *cimporter) importBinaryViewLike() (err error) { + if err = imp.checkNoChildren(); err != nil { + return + } + + buffers := make([]*memory.Buffer, len(imp.cbuffers)-1) + defer memory.ReleaseBuffers(buffers) + + if buffers[0], err = imp.importNullBitmap(0); err != nil { + return + } + + if buffers[1], err = imp.importFixedSizeBuffer(1, int64(arrow.ViewHeaderSizeBytes)); err != nil { + return + } + + dataBufferSizes := unsafe.Slice((*int64)(unsafe.Pointer(imp.cbuffers[len(buffers)])), len(buffers)-2) + for i, size := range dataBufferSizes { + if buffers[i+2], err = imp.importVariableValuesBuffer(i+2, 1, size); 
err != nil { + return + } + } + + imp.data = array.NewData(imp.dt, int(imp.arr.length), buffers, nil, int(imp.arr.null_count), int(imp.arr.offset)) + return +} + func (imp *cimporter) importListLike() (err error) { if err = imp.checkNumChildren(1); err != nil { return err @@ -683,6 +736,43 @@ func (imp *cimporter) importListLike() (err error) { return } +func (imp *cimporter) importListViewLike() (err error) { + offsetSize := int64(imp.dt.Layout().Buffers[1].ByteWidth) + + if err = imp.checkNumChildren(1); err != nil { + return err + } + + if err = imp.checkNumBuffers(3); err != nil { + return err + } + + var nulls, offsets, sizes *memory.Buffer + if nulls, err = imp.importNullBitmap(0); err != nil { + return + } + if nulls != nil { + defer nulls.Release() + } + + if offsets, err = imp.importFixedSizeBuffer(1, offsetSize); err != nil { + return + } + if offsets != nil { + defer offsets.Release() + } + + if sizes, err = imp.importFixedSizeBuffer(2, offsetSize); err != nil { + return + } + if sizes != nil { + defer sizes.Release() + } + + imp.data = array.NewData(imp.dt, int(imp.arr.length), []*memory.Buffer{nulls, offsets, sizes}, []arrow.ArrayData{imp.children[0].data}, int(imp.arr.null_count), int(imp.arr.offset)) + return +} + func (imp *cimporter) importFixedSizePrimitive() error { if err := imp.checkNoChildren(); err != nil { return err diff --git a/go/arrow/cdata/cdata_exports.go b/go/arrow/cdata/cdata_exports.go index d5fdc0dac15df..9c7c238ffb7b4 100644 --- a/go/arrow/cdata/cdata_exports.go +++ b/go/arrow/cdata/cdata_exports.go @@ -167,6 +167,10 @@ func (exp *schemaExporter) exportFormat(dt arrow.DataType) string { return "u" case *arrow.LargeStringType: return "U" + case *arrow.BinaryViewType: + return "vz" + case *arrow.StringViewType: + return "vu" case *arrow.Date32Type: return "tdD" case *arrow.Date64Type: @@ -228,6 +232,10 @@ func (exp *schemaExporter) exportFormat(dt arrow.DataType) string { return "+l" case *arrow.LargeListType: return "+L" + case *arrow.ListViewType: + return "+vl" + case *arrow.LargeListViewType: + return "+vL" case *arrow.FixedSizeListType: return fmt.Sprintf("+w:%d", dt.Len()) case *arrow.StructType: @@ -328,6 +336,15 @@ func allocateBufferPtrArr(n int) (out []*C.void) { return } +func allocateBufferSizeArr(n int) (out []C.int64_t) { + s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) + s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof(int64(0))))) + s.Len = n + s.Cap = n + + return +} + func (exp *schemaExporter) finish(out *CArrowSchema) { out.dictionary = nil if exp.dict != nil { @@ -368,15 +385,19 @@ func exportArray(arr arrow.Array, out *CArrowArray, outSchema *CArrowSchema) { exportField(arrow.Field{Type: arr.DataType()}, outSchema) } - nbuffers := len(arr.Data().Buffers()) - buf_offset := 0 + buffers := arr.Data().Buffers() // Some types don't have validity bitmaps, but we keep them shifted // to make processing easier in other contexts. This means that // we have to adjust when exporting. 
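	// For the view types handled below, the C data interface additionally
	// expects a trailing buffer holding the length of each variadic data
	// buffer as an int64; the has_buffer_sizes_buffer branch further down
	// allocates that sizes array and appends it as the last exported buffer.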
has_validity_bitmap := internal.DefaultHasValidityBitmap(arr.DataType().ID()) - if nbuffers > 0 && !has_validity_bitmap { - nbuffers-- - buf_offset++ + if len(buffers) > 0 && !has_validity_bitmap { + buffers = buffers[1:] + } + nbuffers := len(buffers) + + has_buffer_sizes_buffer := internal.HasBufferSizesBuffer(arr.DataType().ID()) + if has_buffer_sizes_buffer { + nbuffers++ } out.dictionary = nil @@ -387,25 +408,34 @@ func exportArray(arr arrow.Array, out *CArrowArray, outSchema *CArrowSchema) { out.buffers = nil if nbuffers > 0 { - bufs := arr.Data().Buffers() - buffers := allocateBufferPtrArr(nbuffers) - for i, buf := range bufs[buf_offset:] { + cBufs := allocateBufferPtrArr(nbuffers) + for i, buf := range buffers { if buf == nil || buf.Len() == 0 { if i > 0 || !has_validity_bitmap { // apache/arrow#33936: export a dummy buffer to be friendly to // implementations that don't import NULL properly - buffers[i] = (*C.void)(unsafe.Pointer(&C.kGoCdataZeroRegion)) + cBufs[i] = (*C.void)(unsafe.Pointer(&C.kGoCdataZeroRegion)) } else { // null pointer permitted for the validity bitmap // (assuming null count is 0) - buffers[i] = nil + cBufs[i] = nil } continue } - buffers[i] = (*C.void)(unsafe.Pointer(&buf.Bytes()[0])) + cBufs[i] = (*C.void)(unsafe.Pointer(&buf.Bytes()[0])) + } + + if has_buffer_sizes_buffer { + sizes := allocateBufferSizeArr(len(buffers[2:])) + for i, buf := range buffers[2:] { + sizes[i] = C.int64_t(buf.Len()) + } + if len(sizes) > 0 { + cBufs[nbuffers-1] = (*C.void)(unsafe.Pointer(&sizes[0])) + } } - out.buffers = (*unsafe.Pointer)(unsafe.Pointer(&buffers[0])) + out.buffers = (*unsafe.Pointer)(unsafe.Pointer(&cBufs[0])) } arr.Data().Retain() diff --git a/go/arrow/compute/arithmetic.go b/go/arrow/compute/arithmetic.go index 1ee1959b2ddc8..052d79610bcba 100644 --- a/go/arrow/compute/arithmetic.go +++ b/go/arrow/compute/arithmetic.go @@ -678,8 +678,8 @@ func RegisterScalarArithmetic(reg FunctionRegistry) { // the allocated space is for duration (an int64) but we // wrote the time32 - time32 as if the output was time32 // so a quick copy in reverse expands the int32s to int64. 
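			// Iterating from the end means every widened int64 is written into
			// slots whose int32 contents have already been read, so the in-place
			// expansion cannot overwrite values before they are copied.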
- rawData := exec.GetData[int32](out.Buffers[1].Buf) - outData := exec.GetData[int64](out.Buffers[1].Buf) + rawData := arrow.GetData[int32](out.Buffers[1].Buf) + outData := arrow.GetData[int64](out.Buffers[1].Buf) for i := out.Len - 1; i >= 0; i-- { outData[i] = int64(rawData[i]) diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go index c9c3f1ceb03e9..34c1bc6d98d65 100644 --- a/go/arrow/compute/arithmetic_test.go +++ b/go/arrow/compute/arithmetic_test.go @@ -195,7 +195,7 @@ func (b *Float16BinaryFuncTestSuite) TestSub() { } } -type BinaryArithmeticSuite[T exec.NumericTypes] struct { +type BinaryArithmeticSuite[T arrow.NumericType] struct { BinaryFuncTestSuite opts compute.ArithmeticOptions @@ -205,7 +205,7 @@ type BinaryArithmeticSuite[T exec.NumericTypes] struct { } func (BinaryArithmeticSuite[T]) DataType() arrow.DataType { - return exec.GetDataType[T]() + return arrow.GetDataType[T]() } func (b *BinaryArithmeticSuite[T]) setNansEqual(val bool) { @@ -564,7 +564,7 @@ func (bs *BinaryFloatingArithmeticSuite[T]) TestLog() { bs.assertBinopErr(compute.Logb, `["-Inf"]`, `[2]`, "logarithm of negative number") } -type BinaryIntegralArithmeticSuite[T exec.IntTypes | exec.UintTypes] struct { +type BinaryIntegralArithmeticSuite[T arrow.IntType | arrow.UintType] struct { BinaryArithmeticSuite[T] } @@ -2412,7 +2412,7 @@ func TestUnaryArithmeticNull(t *testing.T) { } } -type UnaryArithmeticSuite[T exec.NumericTypes, O fnOpts] struct { +type UnaryArithmeticSuite[T arrow.NumericType, O fnOpts] struct { suite.Suite mem *memory.CheckedAllocator @@ -2433,7 +2433,7 @@ func (us *UnaryArithmeticSuite[T, O]) TearDownTest() { } func (*UnaryArithmeticSuite[T, O]) datatype() arrow.DataType { - return exec.GetDataType[T]() + return arrow.GetDataType[T]() } func (us *UnaryArithmeticSuite[T, O]) makeNullScalar() scalar.Scalar { @@ -2532,7 +2532,7 @@ func (us *UnaryArithmeticSuite[T, O]) assertUnaryOpErr(fn unaryArithmeticFunc[O] us.ErrorContains(err, msg) } -type UnaryArithmeticIntegral[T exec.IntTypes | exec.UintTypes] struct { +type UnaryArithmeticIntegral[T arrow.IntType | arrow.UintType] struct { UnaryArithmeticSuite[T, compute.ArithmeticOptions] } @@ -2598,7 +2598,7 @@ func (us *UnaryArithmeticIntegral[T]) TestLog() { } } -type UnaryArithmeticSigned[T exec.IntTypes] struct { +type UnaryArithmeticSigned[T arrow.IntType] struct { UnaryArithmeticIntegral[T] } @@ -2678,7 +2678,7 @@ func (us *UnaryArithmeticSigned[T]) TestNegate() { }) } -type UnaryArithmeticUnsigned[T exec.UintTypes] struct { +type UnaryArithmeticUnsigned[T arrow.UintType] struct { UnaryArithmeticIntegral[T] } @@ -2965,12 +2965,12 @@ func TestUnaryArithmetic(t *testing.T) { suite.Run(t, new(DecimalUnaryArithmeticSuite)) } -type BitwiseArithmeticSuite[T exec.IntTypes | exec.UintTypes] struct { +type BitwiseArithmeticSuite[T arrow.IntType | arrow.UintType] struct { BinaryFuncTestSuite } func (bs *BitwiseArithmeticSuite[T]) datatype() arrow.DataType { - return exec.GetDataType[T]() + return arrow.GetDataType[T]() } // to make it easier to test different widths, tests give bytes which @@ -3061,7 +3061,7 @@ var roundModes = []compute.RoundMode{ compute.RoundHalfToOdd, } -type UnaryRoundSuite[T exec.NumericTypes] struct { +type UnaryRoundSuite[T arrow.NumericType] struct { UnaryArithmeticSuite[T, compute.RoundOptions] } @@ -3073,7 +3073,7 @@ func (us *UnaryRoundSuite[T]) setRoundNDigits(v int64) { us.opts.NDigits = v } -type UnaryRoundToMultipleSuite[T exec.NumericTypes] struct { +type UnaryRoundToMultipleSuite[T 
arrow.NumericType] struct { UnaryArithmeticSuite[T, compute.RoundToMultipleOptions] } @@ -3085,15 +3085,15 @@ func (us *UnaryRoundToMultipleSuite[T]) setRoundMultiple(val float64) { us.opts.Multiple = scalar.NewFloat64Scalar(val) } -type UnaryRoundIntegral[T exec.IntTypes | exec.UintTypes] struct { +type UnaryRoundIntegral[T arrow.IntType | arrow.UintType] struct { UnaryRoundSuite[T] } -type UnaryRoundToMultipleIntegral[T exec.IntTypes | exec.UintTypes] struct { +type UnaryRoundToMultipleIntegral[T arrow.IntType | arrow.UintType] struct { UnaryRoundToMultipleSuite[T] } -type UnaryRoundSigned[T exec.IntTypes] struct { +type UnaryRoundSigned[T arrow.IntType] struct { UnaryRoundIntegral[T] } @@ -3130,7 +3130,7 @@ func (us *UnaryRoundSigned[T]) TestRound() { } } -type UnaryRoundToMultipleSigned[T exec.IntTypes] struct { +type UnaryRoundToMultipleSigned[T arrow.IntType] struct { UnaryRoundToMultipleIntegral[T] } @@ -3164,7 +3164,7 @@ func (us *UnaryRoundToMultipleSigned[T]) TestRoundToMultiple() { } } -type UnaryRoundUnsigned[T exec.UintTypes] struct { +type UnaryRoundUnsigned[T arrow.UintType] struct { UnaryRoundIntegral[T] } @@ -3201,7 +3201,7 @@ func (us *UnaryRoundUnsigned[T]) TestRound() { } } -type UnaryRoundToMultipleUnsigned[T exec.UintTypes] struct { +type UnaryRoundToMultipleUnsigned[T arrow.UintType] struct { UnaryRoundToMultipleIntegral[T] } diff --git a/go/arrow/compute/exec/utils.go b/go/arrow/compute/exec/utils.go index 276e4570ca968..1b5e69a502cfd 100644 --- a/go/arrow/compute/exec/utils.go +++ b/go/arrow/compute/exec/utils.go @@ -21,96 +21,21 @@ package exec import ( "fmt" "math" - "reflect" "sync/atomic" "unsafe" "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/array" "github.com/apache/arrow/go/v15/arrow/bitutil" - "github.com/apache/arrow/go/v15/arrow/decimal128" - "github.com/apache/arrow/go/v15/arrow/decimal256" - "github.com/apache/arrow/go/v15/arrow/float16" "github.com/apache/arrow/go/v15/arrow/memory" "golang.org/x/exp/constraints" "golang.org/x/exp/slices" ) -// IntTypes is a type constraint for raw values represented as signed -// integer types by Arrow. We aren't just using constraints.Signed -// because we don't want to include the raw `int` type here whose size -// changes based on the architecture (int32 on 32-bit architectures and -// int64 on 64-bit architectures). -// -// This will also cover types like MonthInterval or the time types -// as their underlying types are int32 and int64 which will get covered -// by using the ~ -type IntTypes interface { - ~int8 | ~int16 | ~int32 | ~int64 -} - -// UintTypes is a type constraint for raw values represented as unsigned -// integer types by Arrow. We aren't just using constraints.Unsigned -// because we don't want to include the raw `uint` type here whose size -// changes based on the architecture (uint32 on 32-bit architectures and -// uint64 on 64-bit architectures). We also don't want to include uintptr -type UintTypes interface { - ~uint8 | ~uint16 | ~uint32 | ~uint64 -} - -// FloatTypes is a type constraint for raw values for representing -// floating point values in Arrow. This consists of constraints.Float and -// float16.Num -type FloatTypes interface { - float16.Num | constraints.Float -} - -// NumericTypes is a type constraint for just signed/unsigned integers -// and float32/float64. 
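// (The constraints and helpers removed below live on under the arrow package:
// arrow.IntType, arrow.UintType, arrow.FloatType, arrow.NumericType,
// arrow.FixedWidthType, arrow.TemporalType, plus arrow.GetBytes, arrow.GetData,
// arrow.GetDataType and arrow.GetType, which is what the call sites elsewhere
// in this patch switch to. A minimal sketch of the relocated generics, with a
// hypothetical helper and assuming arrow.GetData keeps the []byte -> []T
// signature of the exec.GetData removed below:
//
//	import "github.com/apache/arrow/go/v15/arrow"
//
//	func countValues[T arrow.IntType | arrow.UintType](raw []byte) int {
//		vals := arrow.GetData[T](raw) // reinterpret raw bytes as []T
//		return len(vals)
//	}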
-type NumericTypes interface { - IntTypes | UintTypes | constraints.Float -} - -// DecimalTypes is a type constraint for raw values representing larger -// decimal type values in Arrow, specifically decimal128 and decimal256. -type DecimalTypes interface { - decimal128.Num | decimal256.Num -} - -// FixedWidthTypes is a type constraint for raw values in Arrow that -// can be represented as FixedWidth byte slices. Specifically this is for -// using Go generics to easily re-type a byte slice to a properly-typed -// slice. Booleans are excluded here since they are represented by Arrow -// as a bitmap and thus the buffer can't be just reinterpreted as a []bool -type FixedWidthTypes interface { - IntTypes | UintTypes | - FloatTypes | DecimalTypes | - arrow.DayTimeInterval | arrow.MonthDayNanoInterval -} - -type TemporalTypes interface { - arrow.Date32 | arrow.Date64 | arrow.Time32 | arrow.Time64 | - arrow.Timestamp | arrow.Duration | arrow.DayTimeInterval | - arrow.MonthInterval | arrow.MonthDayNanoInterval -} - -func GetValues[T FixedWidthTypes](data arrow.ArrayData, i int) []T { - if data.Buffers()[i] == nil || data.Buffers()[i].Len() == 0 { - return nil - } - ret := unsafe.Slice((*T)(unsafe.Pointer(&data.Buffers()[i].Bytes()[0])), data.Offset()+data.Len()) - return ret[data.Offset():] -} - -func GetOffsets[T int32 | int64](data arrow.ArrayData, i int) []T { - ret := unsafe.Slice((*T)(unsafe.Pointer(&data.Buffers()[i].Bytes()[0])), data.Offset()+data.Len()+1) - return ret[data.Offset():] -} - // GetSpanValues returns a properly typed slice by reinterpreting // the buffer at index i using unsafe.Slice. This will take into account // the offset of the given ArraySpan. -func GetSpanValues[T FixedWidthTypes](span *ArraySpan, i int) []T { +func GetSpanValues[T arrow.FixedWidthType](span *ArraySpan, i int) []T { if len(span.Buffers[i].Buf) == 0 { return nil } @@ -126,16 +51,6 @@ func GetSpanOffsets[T int32 | int64](span *ArraySpan, i int) []T { return ret[span.Offset:] } -func GetBytes[T FixedWidthTypes](in []T) []byte { - var z T - return unsafe.Slice((*byte)(unsafe.Pointer(&in[0])), len(in)*int(unsafe.Sizeof(z))) -} - -func GetData[T FixedWidthTypes](in []byte) []T { - var z T - return unsafe.Slice((*T)(unsafe.Pointer(&in[0])), len(in)/int(unsafe.Sizeof(z))) -} - func Min[T constraints.Ordered](a, b T) T { if a < b { return a @@ -165,59 +80,22 @@ func OptionsInit[T any](_ *KernelCtx, args KernelInitArgs) (KernelState, error) arrow.ErrInvalid) } -var typMap = map[reflect.Type]arrow.DataType{ - reflect.TypeOf(false): arrow.FixedWidthTypes.Boolean, - reflect.TypeOf(int8(0)): arrow.PrimitiveTypes.Int8, - reflect.TypeOf(int16(0)): arrow.PrimitiveTypes.Int16, - reflect.TypeOf(int32(0)): arrow.PrimitiveTypes.Int32, - reflect.TypeOf(int64(0)): arrow.PrimitiveTypes.Int64, - reflect.TypeOf(uint8(0)): arrow.PrimitiveTypes.Uint8, - reflect.TypeOf(uint16(0)): arrow.PrimitiveTypes.Uint16, - reflect.TypeOf(uint32(0)): arrow.PrimitiveTypes.Uint32, - reflect.TypeOf(uint64(0)): arrow.PrimitiveTypes.Uint64, - reflect.TypeOf(float32(0)): arrow.PrimitiveTypes.Float32, - reflect.TypeOf(float64(0)): arrow.PrimitiveTypes.Float64, - reflect.TypeOf(string("")): arrow.BinaryTypes.String, - reflect.TypeOf(arrow.Date32(0)): arrow.FixedWidthTypes.Date32, - reflect.TypeOf(arrow.Date64(0)): arrow.FixedWidthTypes.Date64, - reflect.TypeOf(true): arrow.FixedWidthTypes.Boolean, - reflect.TypeOf(float16.Num{}): arrow.FixedWidthTypes.Float16, - reflect.TypeOf([]byte{}): arrow.BinaryTypes.Binary, -} - -// GetDataType returns the 
appropriate arrow.DataType for the given type T -// only for non-parametric types. This uses a map and reflection internally -// so don't call this in a tight loop, instead call this once and then use -// a closure with the result. -func GetDataType[T NumericTypes | bool | string | []byte | float16.Num]() arrow.DataType { - var z T - return typMap[reflect.TypeOf(z)] -} - -// GetType returns the appropriate arrow.Type type T, only for non-parametric -// types. This uses a map and reflection internally so don't call this in -// a tight loop, instead call it once and then use a closure with the result. -func GetType[T NumericTypes | bool | string]() arrow.Type { - var z T - return typMap[reflect.TypeOf(z)].ID() -} - -type arrayBuilder[T NumericTypes | bool] interface { +type arrayBuilder[T arrow.NumericType | bool] interface { array.Builder Append(T) AppendValues([]T, []bool) } -func ArrayFromSlice[T NumericTypes | bool](mem memory.Allocator, data []T) arrow.Array { - bldr := array.NewBuilder(mem, typMap[reflect.TypeOf(data).Elem()]).(arrayBuilder[T]) +func ArrayFromSlice[T arrow.NumericType | bool](mem memory.Allocator, data []T) arrow.Array { + bldr := array.NewBuilder(mem, arrow.GetDataType[T]()).(arrayBuilder[T]) defer bldr.Release() bldr.AppendValues(data, nil) return bldr.NewArray() } -func ArrayFromSliceWithValid[T NumericTypes | bool](mem memory.Allocator, data []T, valid []bool) arrow.Array { - bldr := array.NewBuilder(mem, typMap[reflect.TypeOf(data).Elem()]).(arrayBuilder[T]) +func ArrayFromSliceWithValid[T arrow.NumericType | bool](mem memory.Allocator, data []T, valid []bool) arrow.Array { + bldr := array.NewBuilder(mem, arrow.GetDataType[T]()).(arrayBuilder[T]) defer bldr.Release() bldr.AppendValues(data, valid) @@ -323,7 +201,7 @@ func (c *ChunkResolver) Resolve(idx int64) (chunk, index int64) { } type arrayTypes interface { - FixedWidthTypes | TemporalTypes | bool | string | []byte + arrow.FixedWidthType | arrow.TemporalType | bool | string | []byte } type ArrayIter[T arrayTypes] interface { @@ -345,11 +223,11 @@ func (b *BoolIter) Next() (out bool) { return } -type PrimitiveIter[T FixedWidthTypes] struct { +type PrimitiveIter[T arrow.FixedWidthType] struct { Values []T } -func NewPrimitiveIter[T FixedWidthTypes](arr *ArraySpan) ArrayIter[T] { +func NewPrimitiveIter[T arrow.FixedWidthType](arr *ArraySpan) ArrayIter[T] { return &PrimitiveIter[T]{Values: GetSpanValues[T](arr, 1)} } diff --git a/go/arrow/compute/exec/utils_test.go b/go/arrow/compute/exec/utils_test.go index b26e4ff41e79f..345d6dcf3b4c4 100644 --- a/go/arrow/compute/exec/utils_test.go +++ b/go/arrow/compute/exec/utils_test.go @@ -53,7 +53,7 @@ func TestRechunkConsistentArraysTrivial(t *testing.T) { } } -func assertEqual[T exec.NumericTypes](t *testing.T, mem memory.Allocator, arr arrow.Array, data []T) { +func assertEqual[T arrow.NumericType](t *testing.T, mem memory.Allocator, arr arrow.Array, data []T) { exp := exec.ArrayFromSlice(mem, data) defer exp.Release() assert.Truef(t, array.Equal(exp, arr), "expected: %s\ngot: %s", exp, arr) diff --git a/go/arrow/compute/fieldref.go b/go/arrow/compute/fieldref.go index 565ae3bfadbd0..036e1e355ed75 100644 --- a/go/arrow/compute/fieldref.go +++ b/go/arrow/compute/fieldref.go @@ -282,31 +282,31 @@ type refImpl interface { // // Nested fields can be referenced as well, given the schema: // -// arrow.NewSchema([]arrow.Field{ -// {Name: "a", Type: arrow.StructOf(arrow.Field{Name: "n", Type: arrow.Null})}, -// {Name: "b", Type: arrow.PrimitiveTypes.Int32}, -// }) +// 
arrow.NewSchema([]arrow.Field{ +// {Name: "a", Type: arrow.StructOf(arrow.Field{Name: "n", Type: arrow.Null})}, +// {Name: "b", Type: arrow.PrimitiveTypes.Int32}, +// }) // // the following all indicate the nested field named "n": // -// FieldRefPath(FieldPath{0, 0}) -// FieldRefList("a", 0) -// FieldRefList("a", "n") -// FieldRefList(0, "n") -// NewFieldRefFromDotPath(".a[0]") +// FieldRefPath(FieldPath{0, 0}) +// FieldRefList("a", 0) +// FieldRefList("a", "n") +// FieldRefList(0, "n") +// NewFieldRefFromDotPath(".a[0]") // // FieldPaths matching a FieldRef are retrieved with the FindAll* functions // Multiple matches are possible because field names may be duplicated within // a schema. For example: // -// aIsAmbiguous := arrow.NewSchema([]arrow.Field{ -// {Name: "a", Type: arrow.PrimitiveTypes.Int32}, -// {Name: "a", Type: arrow.PrimitiveTypes.Float32}, -// }) -// matches := FieldRefName("a").FindAll(aIsAmbiguous) -// assert.Len(matches, 2) -// assert.True(matches[0].Get(aIsAmbiguous).Equals(aIsAmbiguous.Field(0)) -// assert.True(matches[1].Get(aIsAmbiguous).Equals(aIsAmbiguous.Field(1)) +// aIsAmbiguous := arrow.NewSchema([]arrow.Field{ +// {Name: "a", Type: arrow.PrimitiveTypes.Int32}, +// {Name: "a", Type: arrow.PrimitiveTypes.Float32}, +// }) +// matches := FieldRefName("a").FindAll(aIsAmbiguous) +// assert.Len(matches, 2) +// assert.True(matches[0].Get(aIsAmbiguous).Equals(aIsAmbiguous.Field(0)) +// assert.True(matches[1].Get(aIsAmbiguous).Equals(aIsAmbiguous.Field(1)) type FieldRef struct { impl refImpl } @@ -346,17 +346,18 @@ func FieldRefList(elems ...interface{}) FieldRef { // NewFieldRefFromDotPath parses a dot path into a field ref. // // dot_path = '.' name -// | '[' digit+ ']' -// | dot_path+ +// +// | '[' digit+ ']' +// | dot_path+ // // Examples // -// ".alpha" => FieldRefName("alpha") -// "[2]" => FieldRefIndex(2) -// ".beta[3]" => FieldRefList("beta", 3) -// "[5].gamma.delta[7]" => FieldRefList(5, "gamma", "delta", 7) -// ".hello world" => FieldRefName("hello world") -// `.\[y\]\\tho\.\` => FieldRef(`[y]\tho.\`) +// ".alpha" => FieldRefName("alpha") +// "[2]" => FieldRefIndex(2) +// ".beta[3]" => FieldRefList("beta", 3) +// "[5].gamma.delta[7]" => FieldRefList(5, "gamma", "delta", 7) +// ".hello world" => FieldRefName("hello world") +// `.\[y\]\\tho\.\` => FieldRef(`[y]\tho.\`) // // Note: when parsing a name, a '\' preceding any other character will be // dropped from the resulting name. 
therefore if a name must contain the characters diff --git a/go/arrow/compute/internal/kernels/base_arithmetic.go b/go/arrow/compute/internal/kernels/base_arithmetic.go index 4ef0031f31484..b795c04c39ead 100644 --- a/go/arrow/compute/internal/kernels/base_arithmetic.go +++ b/go/arrow/compute/internal/kernels/base_arithmetic.go @@ -81,7 +81,7 @@ const ( OpLogbChecked ) -func mulWithOverflow[T exec.IntTypes | exec.UintTypes](a, b T) (T, error) { +func mulWithOverflow[T arrow.IntType | arrow.UintType](a, b T) (T, error) { min, max := MinOf[T](), MaxOf[T]() switch { case a > 0: @@ -107,7 +107,7 @@ func mulWithOverflow[T exec.IntTypes | exec.UintTypes](a, b T) (T, error) { return a * b, nil } -func getGoArithmeticBinary[OutT, Arg0T, Arg1T exec.NumericTypes](op func(a Arg0T, b Arg1T, e *error) OutT) binaryOps[OutT, Arg0T, Arg1T] { +func getGoArithmeticBinary[OutT, Arg0T, Arg1T arrow.NumericType](op func(a Arg0T, b Arg1T, e *error) OutT) binaryOps[OutT, Arg0T, Arg1T] { return binaryOps[OutT, Arg0T, Arg1T]{ arrArr: func(_ *exec.KernelCtx, left []Arg0T, right []Arg1T, out []OutT) error { var err error @@ -143,7 +143,7 @@ var ( errLogNeg = fmt.Errorf("%w: logarithm of negative number", arrow.ErrInvalid) ) -func getGoArithmeticOpIntegral[InT, OutT exec.UintTypes | exec.IntTypes](op ArithmeticOp) exec.ArrayKernelExec { +func getGoArithmeticOpIntegral[InT, OutT arrow.UintType | arrow.IntType](op ArithmeticOp) exec.ArrayKernelExec { switch op { case OpAdd: return ScalarBinary(getGoArithmeticBinary(func(a, b InT, _ *error) OutT { return OutT(a + b) })) @@ -178,7 +178,7 @@ func getGoArithmeticOpIntegral[InT, OutT exec.UintTypes | exec.IntTypes](op Arit if SizeOf[InT]() == SizeOf[OutT]() { return ScalarUnary(func(_ *exec.KernelCtx, arg []InT, out []OutT) error { - in, output := exec.GetBytes(arg), exec.GetBytes(out) + in, output := arrow.GetBytes(arg), arrow.GetBytes(out) copy(output, in) return nil }) @@ -314,7 +314,7 @@ func getGoArithmeticOpIntegral[InT, OutT exec.UintTypes | exec.IntTypes](op Arit } if SizeOf[InT]() == SizeOf[OutT]() { return ScalarUnary(func(_ *exec.KernelCtx, arg []InT, out []OutT) error { - in, output := exec.GetBytes(arg), exec.GetBytes(out) + in, output := arrow.GetBytes(arg), arrow.GetBytes(out) copy(output, in) return nil }) @@ -837,7 +837,7 @@ func ArithmeticExecSameType(ty arrow.Type, op ArithmeticOp) exec.ArrayKernelExec return nil } -func arithmeticExec[InT exec.IntTypes | exec.UintTypes](oty arrow.Type, op ArithmeticOp) exec.ArrayKernelExec { +func arithmeticExec[InT arrow.IntType | arrow.UintType](oty arrow.Type, op ArithmeticOp) exec.ArrayKernelExec { switch oty { case arrow.INT8: return getArithmeticOpIntegral[InT, int8](op) diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go b/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go index 942b8e4ff5600..51b1866fb68fa 100644 --- a/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go +++ b/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go @@ -21,63 +21,64 @@ package kernels import ( "unsafe" + "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/compute/exec" "github.com/apache/arrow/go/v15/arrow/internal/debug" "golang.org/x/exp/constraints" "golang.org/x/sys/cpu" ) -func getAvx2ArithmeticBinaryNumeric[T exec.NumericTypes](op ArithmeticOp) binaryOps[T, T, T] { - typ := exec.GetType[T]() +func getAvx2ArithmeticBinaryNumeric[T arrow.NumericType](op ArithmeticOp) binaryOps[T, T, T] { + typ := arrow.GetType[T]() return binaryOps[T, T, T]{ arrArr: func(_ 
*exec.KernelCtx, Arg0, Arg1, Out []T) error { - arithmeticAvx2(typ, op, exec.GetBytes(Arg0), exec.GetBytes(Arg1), exec.GetBytes(Out), len(Arg0)) + arithmeticAvx2(typ, op, arrow.GetBytes(Arg0), arrow.GetBytes(Arg1), arrow.GetBytes(Out), len(Arg0)) return nil }, arrScalar: func(_ *exec.KernelCtx, Arg0 []T, Arg1 T, Out []T) error { - arithmeticArrScalarAvx2(typ, op, exec.GetBytes(Arg0), unsafe.Pointer(&Arg1), exec.GetBytes(Out), len(Arg0)) + arithmeticArrScalarAvx2(typ, op, arrow.GetBytes(Arg0), unsafe.Pointer(&Arg1), arrow.GetBytes(Out), len(Arg0)) return nil }, scalarArr: func(_ *exec.KernelCtx, Arg0 T, Arg1, Out []T) error { - arithmeticScalarArrAvx2(typ, op, unsafe.Pointer(&Arg0), exec.GetBytes(Arg1), exec.GetBytes(Out), len(Arg1)) + arithmeticScalarArrAvx2(typ, op, unsafe.Pointer(&Arg0), arrow.GetBytes(Arg1), arrow.GetBytes(Out), len(Arg1)) return nil }, } } -func getSSE4ArithmeticBinaryNumeric[T exec.NumericTypes](op ArithmeticOp) binaryOps[T, T, T] { - typ := exec.GetType[T]() +func getSSE4ArithmeticBinaryNumeric[T arrow.NumericType](op ArithmeticOp) binaryOps[T, T, T] { + typ := arrow.GetType[T]() return binaryOps[T, T, T]{ arrArr: func(_ *exec.KernelCtx, Arg0, Arg1, Out []T) error { - arithmeticSSE4(typ, op, exec.GetBytes(Arg0), exec.GetBytes(Arg1), exec.GetBytes(Out), len(Arg0)) + arithmeticSSE4(typ, op, arrow.GetBytes(Arg0), arrow.GetBytes(Arg1), arrow.GetBytes(Out), len(Arg0)) return nil }, arrScalar: func(_ *exec.KernelCtx, Arg0 []T, Arg1 T, Out []T) error { - arithmeticArrScalarSSE4(typ, op, exec.GetBytes(Arg0), unsafe.Pointer(&Arg1), exec.GetBytes(Out), len(Arg0)) + arithmeticArrScalarSSE4(typ, op, arrow.GetBytes(Arg0), unsafe.Pointer(&Arg1), arrow.GetBytes(Out), len(Arg0)) return nil }, scalarArr: func(_ *exec.KernelCtx, Arg0 T, Arg1, Out []T) error { - arithmeticScalarArrSSE4(typ, op, unsafe.Pointer(&Arg0), exec.GetBytes(Arg1), exec.GetBytes(Out), len(Arg1)) + arithmeticScalarArrSSE4(typ, op, unsafe.Pointer(&Arg0), arrow.GetBytes(Arg1), arrow.GetBytes(Out), len(Arg1)) return nil }, } } -func getArithmeticOpIntegral[InT, OutT exec.UintTypes | exec.IntTypes](op ArithmeticOp) exec.ArrayKernelExec { +func getArithmeticOpIntegral[InT, OutT arrow.UintType | arrow.IntType](op ArithmeticOp) exec.ArrayKernelExec { if cpu.X86.HasAVX2 { switch op { case OpAdd, OpSub, OpMul: return ScalarBinary(getAvx2ArithmeticBinaryNumeric[InT](op)) case OpAbsoluteValue, OpNegate: - typ := exec.GetType[InT]() + typ := arrow.GetType[InT]() return ScalarUnary(func(_ *exec.KernelCtx, arg, out []InT) error { - arithmeticUnaryAvx2(typ, op, exec.GetBytes(arg), exec.GetBytes(out), len(arg)) + arithmeticUnaryAvx2(typ, op, arrow.GetBytes(arg), arrow.GetBytes(out), len(arg)) return nil }) case OpSign: - inType, outType := exec.GetType[InT](), exec.GetType[OutT]() + inType, outType := arrow.GetType[InT](), arrow.GetType[OutT]() return ScalarUnary(func(_ *exec.KernelCtx, arg []InT, out []OutT) error { - arithmeticUnaryDiffTypesAvx2(inType, outType, op, exec.GetBytes(arg), exec.GetBytes(out), len(arg)) + arithmeticUnaryDiffTypesAvx2(inType, outType, op, arrow.GetBytes(arg), arrow.GetBytes(out), len(arg)) return nil }) } @@ -86,15 +87,15 @@ func getArithmeticOpIntegral[InT, OutT exec.UintTypes | exec.IntTypes](op Arithm case OpAdd, OpSub, OpMul: return ScalarBinary(getSSE4ArithmeticBinaryNumeric[InT](op)) case OpAbsoluteValue, OpNegate: - typ := exec.GetType[InT]() + typ := arrow.GetType[InT]() return ScalarUnary(func(ctx *exec.KernelCtx, arg, out []InT) error { - arithmeticUnarySSE4(typ, op, exec.GetBytes(arg), 
exec.GetBytes(out), len(arg)) + arithmeticUnarySSE4(typ, op, arrow.GetBytes(arg), arrow.GetBytes(out), len(arg)) return nil }) case OpSign: - inType, outType := exec.GetType[InT](), exec.GetType[OutT]() + inType, outType := arrow.GetType[InT](), arrow.GetType[OutT]() return ScalarUnary(func(_ *exec.KernelCtx, arg []InT, out []OutT) error { - arithmeticUnaryDiffTypesSSE4(inType, outType, op, exec.GetBytes(arg), exec.GetBytes(out), len(arg)) + arithmeticUnaryDiffTypesSSE4(inType, outType, op, arrow.GetBytes(arg), arrow.GetBytes(out), len(arg)) return nil }) } @@ -109,38 +110,38 @@ func getArithmeticOpFloating[InT, OutT constraints.Float](op ArithmeticOp) exec. if cpu.X86.HasAVX2 { switch op { case OpAdd, OpSub, OpAddChecked, OpSubChecked, OpMul, OpMulChecked: - if exec.GetType[InT]() != exec.GetType[OutT]() { + if arrow.GetType[InT]() != arrow.GetType[OutT]() { debug.Assert(false, "not implemented") return nil } return ScalarBinary(getAvx2ArithmeticBinaryNumeric[InT](op)) case OpAbsoluteValue, OpAbsoluteValueChecked, OpNegate, OpNegateChecked, OpSign: - if exec.GetType[InT]() != exec.GetType[OutT]() { + if arrow.GetType[InT]() != arrow.GetType[OutT]() { debug.Assert(false, "not implemented") return nil } - typ := exec.GetType[InT]() + typ := arrow.GetType[InT]() return ScalarUnary(func(_ *exec.KernelCtx, arg, out []InT) error { - arithmeticUnaryAvx2(typ, op, exec.GetBytes(arg), exec.GetBytes(out), len(arg)) + arithmeticUnaryAvx2(typ, op, arrow.GetBytes(arg), arrow.GetBytes(out), len(arg)) return nil }) } } else if cpu.X86.HasSSE42 { switch op { case OpAdd, OpSub, OpAddChecked, OpSubChecked, OpMul, OpMulChecked: - if exec.GetType[InT]() != exec.GetType[OutT]() { + if arrow.GetType[InT]() != arrow.GetType[OutT]() { debug.Assert(false, "not implemented") return nil } return ScalarBinary(getSSE4ArithmeticBinaryNumeric[InT](op)) case OpAbsoluteValue, OpAbsoluteValueChecked, OpNegate, OpNegateChecked, OpSign: - if exec.GetType[InT]() != exec.GetType[OutT]() { + if arrow.GetType[InT]() != arrow.GetType[OutT]() { debug.Assert(false, "not implemented") return nil } - typ := exec.GetType[InT]() + typ := arrow.GetType[InT]() return ScalarUnary(func(_ *exec.KernelCtx, arg, out []InT) error { - arithmeticUnarySSE4(typ, op, exec.GetBytes(arg), exec.GetBytes(out), len(arg)) + arithmeticUnarySSE4(typ, op, arrow.GetBytes(arg), arrow.GetBytes(out), len(arg)) return nil }) } diff --git a/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go b/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go index 8e46ca030c8b7..2c1559fe0f0fd 100644 --- a/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go +++ b/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go @@ -19,6 +19,7 @@ package kernels import ( + "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/compute/exec" "golang.org/x/exp/constraints" ) @@ -27,6 +28,6 @@ func getArithmeticOpFloating[InT, OutT constraints.Float](op ArithmeticOp) exec. 
return getGoArithmeticOpFloating[InT, OutT](op) } -func getArithmeticOpIntegral[InT, OutT exec.UintTypes | exec.IntTypes](op ArithmeticOp) exec.ArrayKernelExec { +func getArithmeticOpIntegral[InT, OutT arrow.UintType | arrow.IntType](op ArithmeticOp) exec.ArrayKernelExec { return getGoArithmeticOpIntegral[InT, OutT](op) } diff --git a/go/arrow/compute/internal/kernels/boolean_cast.go b/go/arrow/compute/internal/kernels/boolean_cast.go index 923c5b3f54512..6109d25790940 100644 --- a/go/arrow/compute/internal/kernels/boolean_cast.go +++ b/go/arrow/compute/internal/kernels/boolean_cast.go @@ -27,7 +27,7 @@ import ( "github.com/apache/arrow/go/v15/arrow/compute/exec" ) -func isNonZero[T exec.FixedWidthTypes](ctx *exec.KernelCtx, in []T, out []byte) error { +func isNonZero[T arrow.FixedWidthType](ctx *exec.KernelCtx, in []T, out []byte) error { var zero T for i, v := range in { bitutil.SetBitTo(out, i, v != zero) diff --git a/go/arrow/compute/internal/kernels/helpers.go b/go/arrow/compute/internal/kernels/helpers.go index 686c4b3e0c29a..1ac09ba43bfb5 100644 --- a/go/arrow/compute/internal/kernels/helpers.go +++ b/go/arrow/compute/internal/kernels/helpers.go @@ -37,9 +37,9 @@ import ( // which will receive a slice containing the raw input data along with // a slice to populate for the output data. // -// Note that bool is not included in exec.FixedWidthTypes since it is +// Note that bool is not included in arrow.FixedWidthType since it is // represented as a bitmap, not as a slice of bool. -func ScalarUnary[OutT, Arg0T exec.FixedWidthTypes](op func(*exec.KernelCtx, []Arg0T, []OutT) error) exec.ArrayKernelExec { +func ScalarUnary[OutT, Arg0T arrow.FixedWidthType](op func(*exec.KernelCtx, []Arg0T, []OutT) error) exec.ArrayKernelExec { return func(ctx *exec.KernelCtx, in *exec.ExecSpan, out *exec.ExecResult) error { arg0 := in.Values[0].Array inData := exec.GetSpanValues[Arg0T](&arg0, 1) @@ -51,7 +51,7 @@ func ScalarUnary[OutT, Arg0T exec.FixedWidthTypes](op func(*exec.KernelCtx, []Ar // ScalarUnaryNotNull is for generating a kernel to operate only on the // non-null values in the input array. The zerovalue of the output type // is used for any null input values. -func ScalarUnaryNotNull[OutT, Arg0T exec.FixedWidthTypes](op func(*exec.KernelCtx, Arg0T, *error) OutT) exec.ArrayKernelExec { +func ScalarUnaryNotNull[OutT, Arg0T arrow.FixedWidthType](op func(*exec.KernelCtx, Arg0T, *error) OutT) exec.ArrayKernelExec { return func(ctx *exec.KernelCtx, in *exec.ExecSpan, out *exec.ExecResult) error { var ( arg0 = &in.Values[0].Array @@ -78,7 +78,7 @@ func ScalarUnaryNotNull[OutT, Arg0T exec.FixedWidthTypes](op func(*exec.KernelCt // ScalarUnaryBoolOutput is like ScalarUnary only it is for cases of boolean // output. The function should take in a slice of the input type and a slice // of bytes to fill with the output boolean bitmap. -func ScalarUnaryBoolOutput[Arg0T exec.FixedWidthTypes](op func(*exec.KernelCtx, []Arg0T, []byte) error) exec.ArrayKernelExec { +func ScalarUnaryBoolOutput[Arg0T arrow.FixedWidthType](op func(*exec.KernelCtx, []Arg0T, []byte) error) exec.ArrayKernelExec { return func(ctx *exec.KernelCtx, in *exec.ExecSpan, out *exec.ExecResult) error { arg0 := in.Values[0].Array inData := exec.GetSpanValues[Arg0T](&arg0, 1) @@ -127,7 +127,7 @@ func ScalarUnaryNotNullBinaryArgBoolOut[OffsetT int32 | int64](defVal bool, op f // It implements the handling to iterate the offsets and values calling // the provided function on each byte slice. 
The zero value of the OutT // will be used as the output for elements of the input that are null. -func ScalarUnaryNotNullBinaryArg[OutT exec.FixedWidthTypes, OffsetT int32 | int64](op func(*exec.KernelCtx, []byte, *error) OutT) exec.ArrayKernelExec { +func ScalarUnaryNotNullBinaryArg[OutT arrow.FixedWidthType, OffsetT int32 | int64](op func(*exec.KernelCtx, []byte, *error) OutT) exec.ArrayKernelExec { return func(ctx *exec.KernelCtx, in *exec.ExecSpan, out *exec.ExecResult) error { var ( arg0 = &in.Values[0].Array @@ -156,14 +156,14 @@ func ScalarUnaryNotNullBinaryArg[OutT exec.FixedWidthTypes, OffsetT int32 | int6 // ScalarUnaryBoolArg is like ScalarUnary except it specifically expects a // function that takes a byte slice since booleans arrays are represented // as a bitmap. -func ScalarUnaryBoolArg[OutT exec.FixedWidthTypes](op func(*exec.KernelCtx, []byte, []OutT) error) exec.ArrayKernelExec { +func ScalarUnaryBoolArg[OutT arrow.FixedWidthType](op func(*exec.KernelCtx, []byte, []OutT) error) exec.ArrayKernelExec { return func(ctx *exec.KernelCtx, input *exec.ExecSpan, out *exec.ExecResult) error { outData := exec.GetSpanValues[OutT](out, 1) return op(ctx, input.Values[0].Array.Buffers[1].Buf, outData) } } -func UnboxScalar[T exec.FixedWidthTypes](val scalar.PrimitiveScalar) T { +func UnboxScalar[T arrow.FixedWidthType](val scalar.PrimitiveScalar) T { return *(*T)(unsafe.Pointer(&val.Data()[0])) } @@ -174,11 +174,11 @@ func UnboxBinaryScalar(val scalar.BinaryScalar) []byte { return val.Data() } -type arrArrFn[OutT, Arg0T, Arg1T exec.FixedWidthTypes] func(*exec.KernelCtx, []Arg0T, []Arg1T, []OutT) error -type arrScalarFn[OutT, Arg0T, Arg1T exec.FixedWidthTypes] func(*exec.KernelCtx, []Arg0T, Arg1T, []OutT) error -type scalarArrFn[OutT, Arg0T, Arg1T exec.FixedWidthTypes] func(*exec.KernelCtx, Arg0T, []Arg1T, []OutT) error +type arrArrFn[OutT, Arg0T, Arg1T arrow.FixedWidthType] func(*exec.KernelCtx, []Arg0T, []Arg1T, []OutT) error +type arrScalarFn[OutT, Arg0T, Arg1T arrow.FixedWidthType] func(*exec.KernelCtx, []Arg0T, Arg1T, []OutT) error +type scalarArrFn[OutT, Arg0T, Arg1T arrow.FixedWidthType] func(*exec.KernelCtx, Arg0T, []Arg1T, []OutT) error -type binaryOps[OutT, Arg0T, Arg1T exec.FixedWidthTypes] struct { +type binaryOps[OutT, Arg0T, Arg1T arrow.FixedWidthType] struct { arrArr arrArrFn[OutT, Arg0T, Arg1T] arrScalar arrScalarFn[OutT, Arg0T, Arg1T] scalarArr scalarArrFn[OutT, Arg0T, Arg1T] @@ -190,7 +190,7 @@ type binaryBoolOps struct { scalarArr func(ctx *exec.KernelCtx, lhs bool, rhs, out bitutil.Bitmap) error } -func ScalarBinary[OutT, Arg0T, Arg1T exec.FixedWidthTypes](ops binaryOps[OutT, Arg0T, Arg1T]) exec.ArrayKernelExec { +func ScalarBinary[OutT, Arg0T, Arg1T arrow.FixedWidthType](ops binaryOps[OutT, Arg0T, Arg1T]) exec.ArrayKernelExec { arrayArray := func(ctx *exec.KernelCtx, arg0, arg1 *exec.ArraySpan, out *exec.ExecResult) error { var ( a0 = exec.GetSpanValues[Arg0T](arg0, 1) @@ -281,7 +281,7 @@ func ScalarBinaryBools(ops *binaryBoolOps) exec.ArrayKernelExec { } } -func ScalarBinaryNotNull[OutT, Arg0T, Arg1T exec.FixedWidthTypes](op func(*exec.KernelCtx, Arg0T, Arg1T, *error) OutT) exec.ArrayKernelExec { +func ScalarBinaryNotNull[OutT, Arg0T, Arg1T arrow.FixedWidthType](op func(*exec.KernelCtx, Arg0T, Arg1T, *error) OutT) exec.ArrayKernelExec { arrayArray := func(ctx *exec.KernelCtx, arg0, arg1 *exec.ArraySpan, out *exec.ExecResult) (err error) { // fast path if one side is entirely null if arg0.UpdateNullCount() == arg0.Len || arg1.UpdateNullCount() == arg1.Len { @@ 
-379,7 +379,7 @@ func ScalarBinaryNotNull[OutT, Arg0T, Arg1T exec.FixedWidthTypes](op func(*exec. } } -type binaryBinOp[T exec.FixedWidthTypes | bool] func(ctx *exec.KernelCtx, arg0, arg1 []byte) T +type binaryBinOp[T arrow.FixedWidthType | bool] func(ctx *exec.KernelCtx, arg0, arg1 []byte) T func ScalarBinaryBinaryArgsBoolOut(itrFn func(*exec.ArraySpan) exec.ArrayIter[[]byte], op binaryBinOp[bool]) exec.ArrayKernelExec { arrArr := func(ctx *exec.KernelCtx, arg0, arg1 *exec.ArraySpan, out *exec.ExecResult) error { @@ -577,7 +577,7 @@ func intsCanFit(data *exec.ArraySpan, target arrow.Type) error { } } -func intsInRange[T exec.IntTypes | exec.UintTypes](data *exec.ArraySpan, lowerBound, upperBound T) error { +func intsInRange[T arrow.IntType | arrow.UintType](data *exec.ArraySpan, lowerBound, upperBound T) error { if MinOf[T]() >= lowerBound && MaxOf[T]() <= upperBound { return nil } @@ -653,7 +653,7 @@ func intsInRange[T exec.IntTypes | exec.UintTypes](data *exec.ArraySpan, lowerBo } type numeric interface { - exec.IntTypes | exec.UintTypes | constraints.Float + arrow.IntType | arrow.UintType | constraints.Float } func memCpySpan[T numeric](in, out *exec.ArraySpan) { @@ -883,12 +883,12 @@ func (bldr *execBufBuilder) finish() (buf *memory.Buffer) { return } -type bufferBuilder[T exec.FixedWidthTypes] struct { +type bufferBuilder[T arrow.FixedWidthType] struct { execBufBuilder zero T } -func newBufferBuilder[T exec.FixedWidthTypes](mem memory.Allocator) *bufferBuilder[T] { +func newBufferBuilder[T arrow.FixedWidthType](mem memory.Allocator) *bufferBuilder[T] { return &bufferBuilder[T]{ execBufBuilder: execBufBuilder{ mem: mem, @@ -901,11 +901,11 @@ func (b *bufferBuilder[T]) reserve(additional int) { } func (b *bufferBuilder[T]) unsafeAppend(value T) { - b.execBufBuilder.unsafeAppend(exec.GetBytes([]T{value})) + b.execBufBuilder.unsafeAppend(arrow.GetBytes([]T{value})) } func (b *bufferBuilder[T]) unsafeAppendSlice(values []T) { - b.execBufBuilder.unsafeAppend(exec.GetBytes(values)) + b.execBufBuilder.unsafeAppend(arrow.GetBytes(values)) } func (b *bufferBuilder[T]) len() int { return b.sz / int(unsafe.Sizeof(b.zero)) } @@ -914,7 +914,7 @@ func (b *bufferBuilder[T]) cap() int { return cap(b.data) / int(unsafe.Sizeof(b.zero)) } -func checkIndexBoundsImpl[T exec.IntTypes | exec.UintTypes](values *exec.ArraySpan, upperLimit uint64) error { +func checkIndexBoundsImpl[T arrow.IntType | arrow.UintType](values *exec.ArraySpan, upperLimit uint64) error { // for unsigned integers, if the values array is larger // than the maximum index value, then there's no need to bounds check isSigned := !arrow.IsUnsignedInteger(values.Type.ID()) diff --git a/go/arrow/compute/internal/kernels/numeric_cast.go b/go/arrow/compute/internal/kernels/numeric_cast.go index c055552bf7ff5..d31edfdd3087c 100644 --- a/go/arrow/compute/internal/kernels/numeric_cast.go +++ b/go/arrow/compute/internal/kernels/numeric_cast.go @@ -69,13 +69,13 @@ func CastIntegerToFloating(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec. 
return nil } -type decimal[T exec.DecimalTypes] interface { +type decimal[T decimal128.Num | decimal256.Num] interface { Less(T) bool GreaterEqual(T) bool LowBits() uint64 } -func decimalToIntImpl[InT exec.DecimalTypes, OutT exec.IntTypes | exec.UintTypes](allowOverflow bool, min, max InT, v decimal[InT], err *error) OutT { +func decimalToIntImpl[InT decimal128.Num | decimal256.Num, OutT arrow.IntType | arrow.UintType](allowOverflow bool, min, max InT, v decimal[InT], err *error) OutT { if !allowOverflow && (v.Less(min) || v.GreaterEqual(max)) { debug.Log("integer value out of bounds from decimal") *err = fmt.Errorf("%w: integer value out of bounds", arrow.ErrInvalid) @@ -84,7 +84,7 @@ func decimalToIntImpl[InT exec.DecimalTypes, OutT exec.IntTypes | exec.UintTypes return OutT(v.LowBits()) } -func CastDecimal256ToInteger[T exec.IntTypes | exec.UintTypes](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { +func CastDecimal256ToInteger[T arrow.IntType | arrow.UintType](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { var ( opts = ctx.State.(CastState) inputType = batch.Values[0].Type().(*arrow.Decimal256Type) @@ -125,7 +125,7 @@ func CastDecimal256ToInteger[T exec.IntTypes | exec.UintTypes](ctx *exec.KernelC return ex(ctx, batch, out) } -func CastDecimal128ToInteger[T exec.IntTypes | exec.UintTypes](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { +func CastDecimal128ToInteger[T arrow.IntType | arrow.UintType](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { var ( opts = ctx.State.(CastState) inputType = batch.Values[0].Type().(*arrow.Decimal128Type) @@ -166,7 +166,7 @@ func CastDecimal128ToInteger[T exec.IntTypes | exec.UintTypes](ctx *exec.KernelC return ex(ctx, batch, out) } -func integerToDecimal128[T exec.IntTypes | exec.UintTypes](inType arrow.Type, outScale int32) exec.ArrayKernelExec { +func integerToDecimal128[T arrow.IntType | arrow.UintType](inType arrow.Type, outScale int32) exec.ArrayKernelExec { var getDecimal func(v T) decimal128.Num switch inType { case arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64: @@ -183,7 +183,7 @@ func integerToDecimal128[T exec.IntTypes | exec.UintTypes](inType arrow.Type, ou }) } -func integerToDecimal256[T exec.IntTypes | exec.UintTypes](inType arrow.Type, outScale int32) exec.ArrayKernelExec { +func integerToDecimal256[T arrow.IntType | arrow.UintType](inType arrow.Type, outScale int32) exec.ArrayKernelExec { var getDecimal func(v T) decimal256.Num switch inType { case arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64: @@ -200,7 +200,7 @@ func integerToDecimal256[T exec.IntTypes | exec.UintTypes](inType arrow.Type, ou }) } -func CastIntegerToDecimal[OutT exec.DecimalTypes, Arg0 exec.IntTypes | exec.UintTypes](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { +func CastIntegerToDecimal[OutT decimal128.Num | decimal256.Num, Arg0 arrow.IntType | arrow.UintType](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { var ( precision, scale int32 executor exec.ArrayKernelExec @@ -234,7 +234,7 @@ func CastIntegerToDecimal[OutT exec.DecimalTypes, Arg0 exec.IntTypes | exec.Uint return executor(ctx, batch, out) } -func getCastIntToDecimal[T exec.DecimalTypes](inType arrow.Type) exec.ArrayKernelExec { +func getCastIntToDecimal[T decimal128.Num | decimal256.Num](inType arrow.Type) exec.ArrayKernelExec { switch inType { case arrow.UINT8: return CastIntegerToDecimal[T, uint8] @@ -543,7 +543,7 @@ func boolToNum[T 
numeric](_ *exec.KernelCtx, in []byte, out []T) error { return nil } -func checkFloatTrunc[InT constraints.Float, OutT exec.IntTypes | exec.UintTypes](in, out *exec.ArraySpan) error { +func checkFloatTrunc[InT constraints.Float, OutT arrow.IntType | arrow.UintType](in, out *exec.ArraySpan) error { wasTrunc := func(out OutT, in InT) bool { return InT(out) != in } @@ -665,7 +665,7 @@ func checkIntToFloatTrunc(in *exec.ArraySpan, outType arrow.Type) error { return nil } -func parseStringToNumberImpl[T exec.IntTypes | exec.UintTypes | exec.FloatTypes, OffsetT int32 | int64](parseFn func(string) (T, error)) exec.ArrayKernelExec { +func parseStringToNumberImpl[T arrow.IntType | arrow.UintType | arrow.FloatType, OffsetT int32 | int64](parseFn func(string) (T, error)) exec.ArrayKernelExec { return ScalarUnaryNotNullBinaryArg[T, OffsetT](func(_ *exec.KernelCtx, in []byte, err *error) T { st := *(*string)(unsafe.Pointer(&in)) v, e := parseFn(st) @@ -749,7 +749,7 @@ func addCommonNumberCasts[T numeric](outTy arrow.DataType, kernels []exec.Scalar return kernels } -func GetCastToInteger[T exec.IntTypes | exec.UintTypes](outType arrow.DataType) []exec.ScalarKernel { +func GetCastToInteger[T arrow.IntType | arrow.UintType](outType arrow.DataType) []exec.ScalarKernel { kernels := make([]exec.ScalarKernel, 0) output := exec.NewOutputType(outType) diff --git a/go/arrow/compute/internal/kernels/scalar_arithmetic.go b/go/arrow/compute/internal/kernels/scalar_arithmetic.go index cf17e9fd9548b..f1ed21065e404 100644 --- a/go/arrow/compute/internal/kernels/scalar_arithmetic.go +++ b/go/arrow/compute/internal/kernels/scalar_arithmetic.go @@ -254,7 +254,7 @@ func GetBitwiseBinaryKernels(op BitwiseOp) []exec.ScalarKernel { return append(kernels, NullExecKernel(2)) } -func bitwiseNot[T exec.IntTypes | exec.UintTypes](_ *exec.KernelCtx, arg T, _ *error) T { +func bitwiseNot[T arrow.IntType | arrow.UintType](_ *exec.KernelCtx, arg T, _ *error) T { return ^arg } @@ -290,7 +290,7 @@ const ( ShiftRight ) -func shiftKernelSignedImpl[T exec.IntTypes, Unsigned exec.UintTypes](dir ShiftDir, checked bool) exec.ArrayKernelExec { +func shiftKernelSignedImpl[T arrow.IntType, Unsigned arrow.UintType](dir ShiftDir, checked bool) exec.ArrayKernelExec { errShift := fmt.Errorf("%w: shift amount must be >= 0 and less than precision of type", arrow.ErrInvalid) maxShift := T(8*SizeOf[T]() - 1) @@ -334,7 +334,7 @@ func shiftKernelSignedImpl[T exec.IntTypes, Unsigned exec.UintTypes](dir ShiftDi return nil } -func shiftKernelUnsignedImpl[T exec.UintTypes](dir ShiftDir, checked bool) exec.ArrayKernelExec { +func shiftKernelUnsignedImpl[T arrow.UintType](dir ShiftDir, checked bool) exec.ArrayKernelExec { errShift := fmt.Errorf("%w: shift amount must be >= 0 and less than precision of type", arrow.ErrInvalid) maxShift := T(8 * SizeOf[T]()) diff --git a/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go b/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go index 8e5ce1ab7c1ad..52cd2c31a2aa4 100644 --- a/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go +++ b/go/arrow/compute/internal/kernels/scalar_comparison_amd64.go @@ -22,7 +22,6 @@ import ( "unsafe" "github.com/apache/arrow/go/v15/arrow" - "github.com/apache/arrow/go/v15/arrow/compute/exec" "golang.org/x/sys/cpu" ) @@ -32,12 +31,12 @@ type cmpfn func(arrow.Type, []byte, []byte, []byte, int64, int) var comparisonMap map[CompareOperator][3]cmpfn -func genCompareKernel[T exec.NumericTypes](op CompareOperator) *CompareData { +func genCompareKernel[T 
arrow.NumericType](op CompareOperator) *CompareData { if pureGo { return genGoCompareKernel(getCmpOp[T](op)) } - ty := exec.GetType[T]() + ty := arrow.GetType[T]() byteWidth := int(unsafe.Sizeof(T(0))) comparisonFns := comparisonMap[op] return &CompareData{ diff --git a/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go b/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go index c0aef5a04e9b8..b36524baa126b 100644 --- a/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go +++ b/go/arrow/compute/internal/kernels/scalar_comparison_noasm.go @@ -18,8 +18,8 @@ package kernels -import "github.com/apache/arrow/go/v15/arrow/compute/exec" +import "github.com/apache/arrow/go/v15/arrow" -func genCompareKernel[T exec.NumericTypes](op CompareOperator) *CompareData { +func genCompareKernel[T arrow.NumericType](op CompareOperator) *CompareData { return genGoCompareKernel(getCmpOp[T](op)) } diff --git a/go/arrow/compute/internal/kernels/scalar_comparisons.go b/go/arrow/compute/internal/kernels/scalar_comparisons.go index 9a7640a8d8a39..29e6db29cb267 100644 --- a/go/arrow/compute/internal/kernels/scalar_comparisons.go +++ b/go/arrow/compute/internal/kernels/scalar_comparisons.go @@ -35,22 +35,22 @@ import ( type binaryKernel func(left, right, out []byte, offset int) -type cmpFn[LeftT, RightT exec.FixedWidthTypes] func([]LeftT, []RightT, []uint32) -type cmpScalarLeft[LeftT, RightT exec.FixedWidthTypes] func(LeftT, []RightT, []uint32) -type cmpScalarRight[LeftT, RightT exec.FixedWidthTypes] func([]LeftT, RightT, []uint32) +type cmpFn[LeftT, RightT arrow.FixedWidthType] func([]LeftT, []RightT, []uint32) +type cmpScalarLeft[LeftT, RightT arrow.FixedWidthType] func(LeftT, []RightT, []uint32) +type cmpScalarRight[LeftT, RightT arrow.FixedWidthType] func([]LeftT, RightT, []uint32) -type cmpOp[T exec.FixedWidthTypes] struct { +type cmpOp[T arrow.FixedWidthType] struct { arrArr cmpFn[T, T] arrScalar cmpScalarRight[T, T] scalarArr cmpScalarLeft[T, T] } -func comparePrimitiveArrayArray[T exec.FixedWidthTypes](op cmpFn[T, T]) binaryKernel { +func comparePrimitiveArrayArray[T arrow.FixedWidthType](op cmpFn[T, T]) binaryKernel { return func(leftBytes, rightBytes, out []byte, offset int) { const batchSize = 32 var ( - left = exec.GetData[T](leftBytes) - right = exec.GetData[T](rightBytes) + left = arrow.GetData[T](leftBytes) + right = arrow.GetData[T](rightBytes) nvals = len(left) nbatches = nvals / batchSize tmpOutput [batchSize]uint32 @@ -83,11 +83,11 @@ func comparePrimitiveArrayArray[T exec.FixedWidthTypes](op cmpFn[T, T]) binaryKe } } -func comparePrimitiveArrayScalar[T exec.FixedWidthTypes](op cmpScalarRight[T, T]) binaryKernel { +func comparePrimitiveArrayScalar[T arrow.FixedWidthType](op cmpScalarRight[T, T]) binaryKernel { return func(leftBytes, rightBytes, out []byte, offset int) { const batchSize = 32 var ( - left = exec.GetData[T](leftBytes) + left = arrow.GetData[T](leftBytes) rightVal = *(*T)(unsafe.Pointer(&rightBytes[0])) nvals = len(left) nbatches = nvals / batchSize @@ -121,12 +121,12 @@ func comparePrimitiveArrayScalar[T exec.FixedWidthTypes](op cmpScalarRight[T, T] } } -func comparePrimitiveScalarArray[T exec.FixedWidthTypes](op cmpScalarLeft[T, T]) binaryKernel { +func comparePrimitiveScalarArray[T arrow.FixedWidthType](op cmpScalarLeft[T, T]) binaryKernel { return func(leftBytes, rightBytes, out []byte, offset int) { const batchSize = 32 var ( leftVal = *(*T)(unsafe.Pointer(&leftBytes[0])) - right = exec.GetData[T](rightBytes) + right = arrow.GetData[T](rightBytes) nvals = 
len(right) nbatches = nvals / batchSize @@ -181,7 +181,7 @@ func getOffsetSpanBytes(span *exec.ArraySpan) []byte { return buf[start : start+(span.Len*byteWidth)] } -func compareKernel[T exec.FixedWidthTypes](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { +func compareKernel[T arrow.FixedWidthType](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { kn := ctx.Kernel.(*exec.ScalarKernel) knData := kn.Data.(CompareFuncData).Funcs() @@ -202,7 +202,7 @@ func compareKernel[T exec.FixedWidthTypes](ctx *exec.KernelCtx, batch *exec.Exec return nil } -func genGoCompareKernel[T exec.FixedWidthTypes](op *cmpOp[T]) *CompareData { +func genGoCompareKernel[T arrow.FixedWidthType](op *cmpOp[T]) *CompareData { return &CompareData{ funcAA: comparePrimitiveArrayArray(op.arrArr), funcAS: comparePrimitiveArrayScalar(op.arrScalar), @@ -376,7 +376,7 @@ func genDecimalCompareKernel[T decimal128.Num | decimal256.Num](op CompareOperat return } -func getCmpOp[T exec.NumericTypes](op CompareOperator) *cmpOp[T] { +func getCmpOp[T arrow.NumericType](op CompareOperator) *cmpOp[T] { switch op { case CmpEQ: return &cmpOp[T]{ @@ -524,7 +524,7 @@ func getBinaryCmp(op CompareOperator) binaryBinOp[bool] { return nil } -func numericCompareKernel[T exec.NumericTypes](ty exec.InputType, op CompareOperator) (kn exec.ScalarKernel) { +func numericCompareKernel[T arrow.NumericType](ty exec.InputType, op CompareOperator) (kn exec.ScalarKernel) { ex := compareKernel[T] kn = exec.NewScalarKernelWithSig(&exec.KernelSignature{ InputTypes: []exec.InputType{ty, ty}, diff --git a/go/arrow/compute/internal/kernels/string_casts.go b/go/arrow/compute/internal/kernels/string_casts.go index 76da901e33f8d..d9cf52320b3aa 100644 --- a/go/arrow/compute/internal/kernels/string_casts.go +++ b/go/arrow/compute/internal/kernels/string_casts.go @@ -116,7 +116,7 @@ func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelC outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) castNumericUnsafe(arrow.INT64, arrow.INT32, - exec.GetBytes(inputOffsets), exec.GetBytes(outOffsets), len(inputOffsets)) + arrow.GetBytes(inputOffsets), arrow.GetBytes(outOffsets), len(inputOffsets)) return nil default: // upcast from int32 -> int64 @@ -127,7 +127,7 @@ func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelC outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) castNumericUnsafe(arrow.INT32, arrow.INT64, - exec.GetBytes(inputOffsets), exec.GetBytes(outOffsets), len(inputOffsets)) + arrow.GetBytes(inputOffsets), arrow.GetBytes(outOffsets), len(inputOffsets)) return nil } } @@ -201,8 +201,8 @@ func GetFsbCastKernels() []exec.ScalarKernel { func float16Formatter(v float16.Num) string { return v.String() } func date32Formatter(v arrow.Date32) string { return v.FormattedString() } func date64Formatter(v arrow.Date64) string { return v.FormattedString() } -func numericFormatterSigned[T exec.IntTypes](v T) string { return strconv.FormatInt(int64(v), 10) } -func numericFormatterUnsigned[T exec.UintTypes](v T) string { return strconv.FormatUint(uint64(v), 10) } +func numericFormatterSigned[T arrow.IntType](v T) string { return strconv.FormatInt(int64(v), 10) } +func numericFormatterUnsigned[T arrow.UintType](v T) string { return strconv.FormatUint(uint64(v), 10) } func float32Formatter(v float32) string { return strconv.FormatFloat(float64(v), 'g', -1, 32) } func float64Formatter(v float64) string { return strconv.FormatFloat(v, 'g', -1, 64) } @@ -247,7 +247,7 @@ func 
timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.Exec return nil } -func numericToStringCastExec[T exec.IntTypes | exec.UintTypes | exec.FloatTypes](formatter func(T) string) exec.ArrayKernelExec { +func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType](formatter func(T) string) exec.ArrayKernelExec { return func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { var ( input = &batch.Values[0].Array diff --git a/go/arrow/compute/internal/kernels/vector_hash.go b/go/arrow/compute/internal/kernels/vector_hash.go index 9401e31cc5b09..f6c9a7f39db93 100644 --- a/go/arrow/compute/internal/kernels/vector_hash.go +++ b/go/arrow/compute/internal/kernels/vector_hash.go @@ -178,7 +178,7 @@ func doAppendFixedSize(action Action, memo hashing.MemoTable, arr *exec.ArraySpa }) } -func doAppendNumeric[T exec.IntTypes | exec.UintTypes | exec.FloatTypes](action Action, memo hashing.MemoTable, arr *exec.ArraySpan) error { +func doAppendNumeric[T arrow.IntType | arrow.UintType | arrow.FloatType](action Action, memo hashing.MemoTable, arr *exec.ArraySpan) error { arrData := exec.GetSpanValues[T](arr, 1) shouldEncodeNulls := action.ShouldEncodeNulls() return bitutils.VisitBitBlocksShort(arr.Buffers[0].Buf, arr.Offset, arr.Len, diff --git a/go/arrow/compute/internal/kernels/vector_run_end_encode.go b/go/arrow/compute/internal/kernels/vector_run_end_encode.go index 076bef1368438..017b9712025b7 100644 --- a/go/arrow/compute/internal/kernels/vector_run_end_encode.go +++ b/go/arrow/compute/internal/kernels/vector_run_end_encode.go @@ -46,18 +46,18 @@ type RunEndsType interface { int16 | int32 | int64 } -func readFixedWidthVal[V exec.FixedWidthTypes](inputValidity, inputValues []byte, offset int64, out *V) bool { +func readFixedWidthVal[V arrow.FixedWidthType](inputValidity, inputValues []byte, offset int64, out *V) bool { sz := int64(unsafe.Sizeof(*out)) *out = *(*V)(unsafe.Pointer(&inputValues[offset*sz])) return bitutil.BitIsSet(inputValidity, int(offset)) } -func writeFixedWidthVal[V exec.FixedWidthTypes](result *exec.ExecResult, offset int64, valid bool, value V) { +func writeFixedWidthVal[V arrow.FixedWidthType](result *exec.ExecResult, offset int64, valid bool, value V) { if len(result.Buffers[0].Buf) != 0 { bitutil.SetBitTo(result.Buffers[0].Buf, int(offset), valid) } - arr := exec.GetData[V](result.Buffers[1].Buf) + arr := arrow.GetData[V](result.Buffers[1].Buf) arr[offset] = value } @@ -73,7 +73,7 @@ func writeBoolVal(result *exec.ExecResult, offset int64, valid bool, value bool) bitutil.SetBitTo(result.Buffers[1].Buf, int(offset), value) } -type runEndEncodeLoopFixedWidth[R RunEndsType, V exec.FixedWidthTypes | bool] struct { +type runEndEncodeLoopFixedWidth[R RunEndsType, V arrow.FixedWidthType | bool] struct { inputLen, inputOffset int64 inputValidity []byte inputValues []byte @@ -84,7 +84,7 @@ type runEndEncodeLoopFixedWidth[R RunEndsType, V exec.FixedWidthTypes | bool] st } func (re *runEndEncodeLoopFixedWidth[R, V]) WriteEncodedRuns(out *exec.ExecResult) int64 { - outputRunEnds := exec.GetData[R](out.Children[0].Buffers[1].Buf) + outputRunEnds := arrow.GetData[R](out.Children[0].Buffers[1].Buf) readOffset := re.inputOffset var currentRun V @@ -155,7 +155,7 @@ func (re *runEndEncodeLoopFixedWidth[R, V]) PreallocOutput(ctx *exec.KernelCtx, valueBuffer = ctx.Allocate(int(numOutput) * bufSpec.ByteWidth) } - reeType := arrow.RunEndEncodedOf(exec.GetDataType[R](), re.valueType) + reeType := arrow.RunEndEncodedOf(arrow.GetDataType[R](), 
re.valueType) out.Release() *out = exec.ExecResult{ @@ -230,7 +230,7 @@ func (re *runEndEncodeFSB[R]) PreallocOutput(ctx *exec.KernelCtx, numOutput int6 } valueBuffer := ctx.Allocate(re.width * int(numOutput)) - reeType := arrow.RunEndEncodedOf(exec.GetDataType[R](), re.valueType) + reeType := arrow.RunEndEncodedOf(arrow.GetDataType[R](), re.valueType) out.Release() *out = exec.ExecResult{ @@ -258,7 +258,7 @@ func (re *runEndEncodeFSB[R]) PreallocOutput(ctx *exec.KernelCtx, numOutput int6 } func (re *runEndEncodeFSB[R]) WriteEncodedRuns(out *exec.ExecResult) int64 { - outputRunEnds := exec.GetData[R](out.Children[0].Buffers[1].Buf) + outputRunEnds := arrow.GetData[R](out.Children[0].Buffers[1].Buf) outputValues := out.Children[1].Buffers[1].Buf readOffset := re.inputOffset @@ -362,7 +362,7 @@ func (re *runEndEncodeLoopBinary[R, O]) PreallocOutput(ctx *exec.KernelCtx, numO valueBuffer := ctx.Allocate(int(re.estimatedValuesLen)) offsetsBuffer := ctx.Allocate(int(numOutput+1) * int(SizeOf[O]())) - reeType := arrow.RunEndEncodedOf(exec.GetDataType[R](), re.valueType) + reeType := arrow.RunEndEncodedOf(arrow.GetDataType[R](), re.valueType) *out = exec.ExecResult{ Type: reeType, Len: re.inputLen, @@ -389,7 +389,7 @@ func (re *runEndEncodeLoopBinary[R, O]) PreallocOutput(ctx *exec.KernelCtx, numO } func (re *runEndEncodeLoopBinary[R, O]) WriteEncodedRuns(out *exec.ExecResult) int64 { - outputRunEnds := exec.GetData[R](out.Children[0].Buffers[1].Buf) + outputRunEnds := arrow.GetData[R](out.Children[0].Buffers[1].Buf) outputOffsets := exec.GetSpanOffsets[O](&out.Children[1], 1) outputValues := out.Children[1].Buffers[2].Buf @@ -443,7 +443,7 @@ func validateRunEndType[R RunEndsType](length int64) error { return nil } -func createEncoder[R RunEndsType, V exec.FixedWidthTypes](input *exec.ArraySpan) *runEndEncodeLoopFixedWidth[R, V] { +func createEncoder[R RunEndsType, V arrow.FixedWidthType](input *exec.ArraySpan) *runEndEncodeLoopFixedWidth[R, V] { return &runEndEncodeLoopFixedWidth[R, V]{ inputLen: input.Len, inputOffset: input.Offset, @@ -539,7 +539,7 @@ func runEndEncodeImpl[R RunEndsType](ctx *exec.KernelCtx, batch *exec.ExecSpan, ) if inputLen == 0 { - reeType := arrow.RunEndEncodedOf(exec.GetDataType[R](), inputArr.Type) + reeType := arrow.RunEndEncodedOf(arrow.GetDataType[R](), inputArr.Type) *out = exec.ExecResult{ Type: reeType, Children: []exec.ArraySpan{ diff --git a/go/arrow/compute/internal/kernels/vector_selection.go b/go/arrow/compute/internal/kernels/vector_selection.go index 714e452325bfd..f08bb4100bf88 100644 --- a/go/arrow/compute/internal/kernels/vector_selection.go +++ b/go/arrow/compute/internal/kernels/vector_selection.go @@ -99,12 +99,12 @@ type builder[T any] interface { UnsafeAppendBoolToBitmap(bool) } -func getTakeIndices[T exec.IntTypes | exec.UintTypes](mem memory.Allocator, filter *exec.ArraySpan, nullSelect NullSelectionBehavior) arrow.ArrayData { +func getTakeIndices[T arrow.IntType | arrow.UintType](mem memory.Allocator, filter *exec.ArraySpan, nullSelect NullSelectionBehavior) arrow.ArrayData { var ( filterData = filter.Buffers[1].Buf haveFilterNulls = filter.MayHaveNulls() filterIsValid = filter.Buffers[0].Buf - idxType = exec.GetDataType[T]() + idxType = arrow.GetDataType[T]() ) if haveFilterNulls && nullSelect == EmitNulls { @@ -394,7 +394,7 @@ func primitiveFilterImpl(wr writeFiltered, values *exec.ArraySpan, filter *exec. 
} } -type filterWriter[T exec.UintTypes] struct { +type filterWriter[T arrow.UintType] struct { outPosition int outOffset int valuesOffset int @@ -519,7 +519,7 @@ func PrimitiveFilter(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecRe return nil } -type primitiveGetter[T exec.IntTypes | bool] interface { +type primitiveGetter[T arrow.IntType | bool] interface { IsValid(int64) bool GetValue(int64) T NullCount() int64 @@ -542,7 +542,7 @@ func (b *boolGetter) GetValue(i int64) bool { func (b *boolGetter) NullCount() int64 { return b.inner.Nulls } func (b *boolGetter) Len() int64 { return b.inner.Len } -type primitiveGetterImpl[T exec.IntTypes] struct { +type primitiveGetterImpl[T arrow.IntType] struct { inner *exec.ArraySpan values []T } @@ -608,7 +608,7 @@ func (c *chunkedBoolGetter) GetValue(i int64) bool { func (c *chunkedBoolGetter) NullCount() int64 { return c.nulls } func (c *chunkedBoolGetter) Len() int64 { return c.len } -type chunkedPrimitiveGetter[T exec.IntTypes] struct { +type chunkedPrimitiveGetter[T arrow.IntType] struct { inner *arrow.Chunked resolver *exec.ChunkResolver nulls int64 @@ -619,7 +619,7 @@ type chunkedPrimitiveGetter[T exec.IntTypes] struct { valuesOffset []int64 } -func newChunkedPrimitiveGetter[T exec.IntTypes](arr *arrow.Chunked) *chunkedPrimitiveGetter[T] { +func newChunkedPrimitiveGetter[T arrow.IntType](arr *arrow.Chunked) *chunkedPrimitiveGetter[T] { nchunks := len(arr.Chunks()) lengths := make([]int64, nchunks) valuesData := make([][]T, nchunks) @@ -630,7 +630,7 @@ func newChunkedPrimitiveGetter[T exec.IntTypes](arr *arrow.Chunked) *chunkedPrim lengths[i] = int64(c.Len()) valuesOffset[i] = int64(c.Data().Offset()) valuesIsValid[i] = c.NullBitmapBytes() - valuesData[i] = exec.GetValues[T](c.Data(), 1) + valuesData[i] = arrow.GetValues[T](c.Data(), 1) } return &chunkedPrimitiveGetter[T]{ @@ -662,7 +662,7 @@ func (c *chunkedPrimitiveGetter[T]) GetValue(i int64) T { func (c *chunkedPrimitiveGetter[T]) NullCount() int64 { return c.nulls } func (c *chunkedPrimitiveGetter[T]) Len() int64 { return c.len } -func primitiveTakeImpl[IdxT exec.UintTypes, ValT exec.IntTypes](values primitiveGetter[ValT], indices *exec.ArraySpan, out *exec.ExecResult) { +func primitiveTakeImpl[IdxT arrow.UintType, ValT arrow.IntType](values primitiveGetter[ValT], indices *exec.ArraySpan, out *exec.ExecResult) { var ( indicesData = exec.GetSpanValues[IdxT](indices, 1) indicesIsValid = indices.Buffers[0].Buf @@ -747,7 +747,7 @@ func primitiveTakeImpl[IdxT exec.UintTypes, ValT exec.IntTypes](values primitive out.Nulls = out.Len - validCount } -func booleanTakeImpl[IdxT exec.UintTypes](values primitiveGetter[bool], indices *exec.ArraySpan, out *exec.ExecResult) { +func booleanTakeImpl[IdxT arrow.UintType](values primitiveGetter[bool], indices *exec.ArraySpan, out *exec.ExecResult) { var ( indicesData = exec.GetSpanValues[IdxT](indices, 1) indicesIsValid = indices.Buffers[0].Buf @@ -876,7 +876,7 @@ func booleanTakeDispatch(values, indices *exec.ArraySpan, out *exec.ExecResult) return nil } -func takeIdxChunkedDispatch[ValT exec.IntTypes](values, indices *arrow.Chunked, out []*exec.ExecResult) error { +func takeIdxChunkedDispatch[ValT arrow.IntType](values, indices *arrow.Chunked, out []*exec.ExecResult) error { getter := newChunkedPrimitiveGetter[ValT](values) var fn func(primitiveGetter[ValT], *exec.ArraySpan, *exec.ExecResult) @@ -901,7 +901,7 @@ func takeIdxChunkedDispatch[ValT exec.IntTypes](values, indices *arrow.Chunked, return nil } -func takeIdxDispatch[ValT 
exec.IntTypes](values, indices *exec.ArraySpan, out *exec.ExecResult) error { +func takeIdxDispatch[ValT arrow.IntType](values, indices *exec.ArraySpan, out *exec.ExecResult) error { getter := &primitiveGetterImpl[ValT]{inner: values, values: exec.GetSpanValues[ValT](values, 1)} switch indices.Type.(arrow.FixedWidthDataType).Bytes() { @@ -1368,7 +1368,7 @@ func binaryFilterImpl[OffsetT int32 | int64](ctx *exec.KernelCtx, values, filter return nil } -func takeExecImpl[T exec.UintTypes](ctx *exec.KernelCtx, outputLen int64, values, indices *exec.ArraySpan, out *exec.ExecResult, visitValid func(int64) error, visitNull func() error) error { +func takeExecImpl[T arrow.UintType](ctx *exec.KernelCtx, outputLen int64, values, indices *exec.ArraySpan, out *exec.ExecResult, visitValid func(int64) error, visitNull func() error) error { var ( validityBuilder = validityBuilder{mem: exec.GetAllocator(ctx.Ctx)} indicesValues = exec.GetSpanValues[T](indices, 1) @@ -1600,7 +1600,7 @@ func ListImpl[OffsetT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out.Buffers[1].WrapBuffer(offsetBuilder.finish()) out.Children = make([]exec.ArraySpan, 1) - out.Children[0].Type = exec.GetDataType[OffsetT]() + out.Children[0].Type = arrow.GetDataType[OffsetT]() out.Children[0].Len = int64(childIdxBuilder.len()) out.Children[0].Buffers[1].WrapBuffer(childIdxBuilder.finish()) diff --git a/go/arrow/compute/scalar_compare_test.go b/go/arrow/compute/scalar_compare_test.go index d209f72c800b0..1fa0591692ecb 100644 --- a/go/arrow/compute/scalar_compare_test.go +++ b/go/arrow/compute/scalar_compare_test.go @@ -89,7 +89,7 @@ func (c *CompareSuite) validateCompareScalarArr(op kernels.CompareOperator, dt a c.validateCompareDatum(op, lhs, &compute.ArrayDatum{rhs.Data()}, &compute.ArrayDatum{exp.Data()}) } -func slowCompare[T exec.NumericTypes | string](op kernels.CompareOperator, lhs, rhs T) bool { +func slowCompare[T arrow.NumericType | string](op kernels.CompareOperator, lhs, rhs T) bool { switch op { case kernels.CmpEQ: return lhs == rhs @@ -108,7 +108,7 @@ func slowCompare[T exec.NumericTypes | string](op kernels.CompareOperator, lhs, } } -// func simpleScalarArrayCompare[T exec.NumericTypes](mem memory.Allocator, op kernels.CompareOperator, lhs, rhs compute.Datum) compute.Datum { +// func simpleScalarArrayCompare[T arrow.NumericType](mem memory.Allocator, op kernels.CompareOperator, lhs, rhs compute.Datum) compute.Datum { // var ( // swap = lhs.Kind() == compute.KindArray // span exec.ArraySpan @@ -230,7 +230,7 @@ type valuer[T any] interface { Value(int) T } -func simpleArrArrCompare[T exec.NumericTypes | string](mem memory.Allocator, op kernels.CompareOperator, lhs, rhs compute.Datum) compute.Datum { +func simpleArrArrCompare[T arrow.NumericType | string](mem memory.Allocator, op kernels.CompareOperator, lhs, rhs compute.Datum) compute.Datum { var ( lArr = lhs.(*compute.ArrayDatum).MakeArray() rArr = rhs.(*compute.ArrayDatum).MakeArray() @@ -263,7 +263,7 @@ func simpleArrArrCompare[T exec.NumericTypes | string](mem memory.Allocator, op return compute.NewDatum(result) } -type NumericCompareSuite[T exec.NumericTypes] struct { +type NumericCompareSuite[T arrow.NumericType] struct { CompareSuite } @@ -282,7 +282,7 @@ type NumericCompareSuite[T exec.NumericTypes] struct { // } func (n *NumericCompareSuite[T]) TestSimpleCompareArrayScalar() { - dt := exec.GetDataType[T]() + dt := arrow.GetDataType[T]() one := compute.NewDatum(scalar.MakeScalar(T(1))) n.Run(dt.String(), func() { @@ -361,7 +361,7 @@ func (n 
*NumericCompareSuite[T]) TestSimpleCompareArrayScalar() { } func (n *NumericCompareSuite[T]) TestSimpleCompareScalarArray() { - dt := exec.GetDataType[T]() + dt := arrow.GetDataType[T]() one := compute.NewDatum(scalar.MakeScalar(T(1))) n.Run(dt.String(), func() { @@ -440,7 +440,7 @@ func (n *NumericCompareSuite[T]) TestSimpleCompareScalarArray() { } func (n *NumericCompareSuite[T]) TestNullScalar() { - dt := exec.GetDataType[T]() + dt := arrow.GetDataType[T]() null := compute.NewDatum(scalar.MakeNullScalar(dt)) n.Run(dt.String(), func() { @@ -453,7 +453,7 @@ func (n *NumericCompareSuite[T]) TestNullScalar() { } func (n *NumericCompareSuite[T]) TestSimpleCompareArrArr() { - dt := exec.GetDataType[T]() + dt := arrow.GetDataType[T]() n.Run(dt.String(), func() { n.validateCompare(kernels.CmpEQ, dt, `[]`, `[]`, `[]`) diff --git a/go/arrow/compute/vector_hash_test.go b/go/arrow/compute/vector_hash_test.go index 9410720de7941..c37db584805d0 100644 --- a/go/arrow/compute/vector_hash_test.go +++ b/go/arrow/compute/vector_hash_test.go @@ -26,7 +26,6 @@ import ( "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/array" "github.com/apache/arrow/go/v15/arrow/compute" - "github.com/apache/arrow/go/v15/arrow/compute/exec" "github.com/apache/arrow/go/v15/arrow/decimal128" "github.com/apache/arrow/go/v15/arrow/decimal256" "github.com/apache/arrow/go/v15/arrow/memory" @@ -36,7 +35,7 @@ import ( "golang.org/x/exp/constraints" ) -func checkUniqueDict[I exec.IntTypes | exec.UintTypes](t *testing.T, input compute.ArrayLikeDatum, expected arrow.Array) { +func checkUniqueDict[I arrow.IntType | arrow.UintType](t *testing.T, input compute.ArrayLikeDatum, expected arrow.Array) { out, err := compute.Unique(context.TODO(), input) require.NoError(t, err) defer out.Release() @@ -52,8 +51,8 @@ func checkUniqueDict[I exec.IntTypes | exec.UintTypes](t *testing.T, input compu require.Truef(t, array.Equal(exDict, resultDict), "wanted: %s\ngot: %s", exDict, resultDict) - want := exec.GetValues[I](expected.(*array.Dictionary).Indices().Data(), 1) - got := exec.GetValues[I](result.Indices().Data(), 1) + want := arrow.GetValues[I](expected.(*array.Dictionary).Indices().Data(), 1) + got := arrow.GetValues[I](result.Indices().Data(), 1) assert.ElementsMatchf(t, got, want, "wanted: %s\ngot: %s", want, got) } @@ -81,15 +80,15 @@ func checkDictionaryUnique(t *testing.T, input compute.ArrayLikeDatum, expected } } -func checkUniqueFixedWidth[T exec.FixedWidthTypes](t *testing.T, input, expected arrow.Array) { +func checkUniqueFixedWidth[T arrow.FixedWidthType](t *testing.T, input, expected arrow.Array) { result, err := compute.UniqueArray(context.TODO(), input) require.NoError(t, err) defer result.Release() require.Truef(t, arrow.TypeEqual(result.DataType(), expected.DataType()), "wanted: %s\ngot: %s", expected.DataType(), result.DataType()) - want := exec.GetValues[T](expected.Data(), 1) - got := exec.GetValues[T](expected.Data(), 1) + want := arrow.GetValues[T](expected.Data(), 1) + got := arrow.GetValues[T](expected.Data(), 1) assert.ElementsMatchf(t, got, want, "wanted: %s\ngot: %s", want, got) } @@ -106,7 +105,7 @@ func checkUniqueVariableWidth[OffsetType int32 | int64](t *testing.T, input, exp createSlice := func(v arrow.Array) [][]byte { var ( - offsets = exec.GetOffsets[OffsetType](v.Data(), 1) + offsets = arrow.GetOffsets[OffsetType](v.Data(), 1) data = v.Data().Buffers()[2].Bytes() out = make([][]byte, v.Len()) ) @@ -124,7 +123,7 @@ func checkUniqueVariableWidth[OffsetType int32 | int64](t 
*testing.T, input, exp } type ArrowType interface { - exec.FixedWidthTypes | string | []byte + arrow.FixedWidthType | string | []byte } type builder[T ArrowType] interface { @@ -166,7 +165,7 @@ func checkUniqueFixedSizeBinary(t *testing.T, mem memory.Allocator, dt *arrow.Fi assert.ElementsMatch(t, want, got) } -func checkUniqueFW[T exec.FixedWidthTypes](t *testing.T, mem memory.Allocator, dt arrow.DataType, inValues, outValues []T, inValid, outValid []bool) { +func checkUniqueFW[T arrow.FixedWidthType](t *testing.T, mem memory.Allocator, dt arrow.DataType, inValues, outValues []T, inValid, outValid []bool) { input := makeArray(mem, dt, inValues, inValid) defer input.Release() expected := makeArray(mem, dt, outValues, outValid) @@ -189,7 +188,7 @@ func checkUniqueVW[T string | []byte](t *testing.T, mem memory.Allocator, dt arr } } -type PrimitiveHashKernelSuite[T exec.IntTypes | exec.UintTypes | constraints.Float] struct { +type PrimitiveHashKernelSuite[T arrow.IntType | arrow.UintType | constraints.Float] struct { suite.Suite mem *memory.CheckedAllocator @@ -197,7 +196,7 @@ type PrimitiveHashKernelSuite[T exec.IntTypes | exec.UintTypes | constraints.Flo } func (ps *PrimitiveHashKernelSuite[T]) SetupSuite() { - ps.dt = exec.GetDataType[T]() + ps.dt = arrow.GetDataType[T]() } func (ps *PrimitiveHashKernelSuite[T]) SetupTest() { diff --git a/go/arrow/compute/vector_selection_test.go b/go/arrow/compute/vector_selection_test.go index f44840ba72034..4e38bc995cdfc 100644 --- a/go/arrow/compute/vector_selection_test.go +++ b/go/arrow/compute/vector_selection_test.go @@ -459,9 +459,9 @@ func (f *FilterKernelNumeric) TestFilterNumeric() { }) } -type comparator[T exec.NumericTypes] func(a, b T) bool +type comparator[T arrow.NumericType] func(a, b T) bool -func getComparator[T exec.NumericTypes](op kernels.CompareOperator) comparator[T] { +func getComparator[T arrow.NumericType](op kernels.CompareOperator) comparator[T] { return []comparator[T]{ // EQUAL func(a, b T) bool { return a == b }, @@ -478,7 +478,7 @@ func getComparator[T exec.NumericTypes](op kernels.CompareOperator) comparator[T }[int8(op)] } -func compareAndFilterImpl[T exec.NumericTypes](mem memory.Allocator, data []T, fn func(T) bool) arrow.Array { +func compareAndFilterImpl[T arrow.NumericType](mem memory.Allocator, data []T, fn func(T) bool) arrow.Array { filtered := make([]T, 0, len(data)) for _, v := range data { if fn(v) { @@ -488,12 +488,12 @@ func compareAndFilterImpl[T exec.NumericTypes](mem memory.Allocator, data []T, f return exec.ArrayFromSlice(mem, filtered) } -func compareAndFilterValue[T exec.NumericTypes](mem memory.Allocator, data []T, val T, op kernels.CompareOperator) arrow.Array { +func compareAndFilterValue[T arrow.NumericType](mem memory.Allocator, data []T, val T, op kernels.CompareOperator) arrow.Array { cmp := getComparator[T](op) return compareAndFilterImpl(mem, data, func(e T) bool { return cmp(e, val) }) } -func compareAndFilterSlice[T exec.NumericTypes](mem memory.Allocator, data, other []T, op kernels.CompareOperator) arrow.Array { +func compareAndFilterSlice[T arrow.NumericType](mem memory.Allocator, data, other []T, op kernels.CompareOperator) arrow.Array { cmp := getComparator[T](op) i := 0 return compareAndFilterImpl(mem, data, func(e T) bool { @@ -503,7 +503,7 @@ func compareAndFilterSlice[T exec.NumericTypes](mem memory.Allocator, data, othe }) } -func createFilterImpl[T exec.NumericTypes](mem memory.Allocator, data []T, fn func(T) bool) arrow.Array { +func createFilterImpl[T arrow.NumericType](mem 
memory.Allocator, data []T, fn func(T) bool) arrow.Array { bldr := array.NewBooleanBuilder(mem) defer bldr.Release() for _, v := range data { @@ -512,12 +512,12 @@ func createFilterImpl[T exec.NumericTypes](mem memory.Allocator, data []T, fn fu return bldr.NewArray() } -func createFilterValue[T exec.NumericTypes](mem memory.Allocator, data []T, val T, op kernels.CompareOperator) arrow.Array { +func createFilterValue[T arrow.NumericType](mem memory.Allocator, data []T, val T, op kernels.CompareOperator) arrow.Array { cmp := getComparator[T](op) return createFilterImpl(mem, data, func(e T) bool { return cmp(e, val) }) } -func createFilterSlice[T exec.NumericTypes](mem memory.Allocator, data, other []T, op kernels.CompareOperator) arrow.Array { +func createFilterSlice[T arrow.NumericType](mem memory.Allocator, data, other []T, op kernels.CompareOperator) arrow.Array { cmp := getComparator[T](op) i := 0 return createFilterImpl(mem, data, func(e T) bool { @@ -527,8 +527,8 @@ func createFilterSlice[T exec.NumericTypes](mem memory.Allocator, data, other [] }) } -func compareScalarAndFilterRandomNumeric[T exec.NumericTypes](t *testing.T, mem memory.Allocator) { - dt := exec.GetDataType[T]() +func compareScalarAndFilterRandomNumeric[T arrow.NumericType](t *testing.T, mem memory.Allocator) { + dt := arrow.GetDataType[T]() rng := gen.NewRandomArrayGenerator(randomSeed, mem) t.Run("compare scalar and filter", func(t *testing.T) { @@ -537,7 +537,7 @@ func compareScalarAndFilterRandomNumeric[T exec.NumericTypes](t *testing.T, mem t.Run(fmt.Sprintf("random %d", length), func(t *testing.T) { arr := rng.Numeric(dt.ID(), length, 0, 100, 0) defer arr.Release() - data := exec.GetData[T](arr.Data().Buffers()[1].Bytes()) + data := arrow.GetData[T](arr.Data().Buffers()[1].Bytes()) for _, op := range []kernels.CompareOperator{kernels.CmpEQ, kernels.CmpNE, kernels.CmpGT, kernels.CmpLE} { selection := createFilterValue(mem, data, 50, op) defer selection.Release() @@ -556,8 +556,8 @@ func compareScalarAndFilterRandomNumeric[T exec.NumericTypes](t *testing.T, mem }) } -func compareArrayAndFilterRandomNumeric[T exec.NumericTypes](t *testing.T, mem memory.Allocator) { - dt := exec.GetDataType[T]() +func compareArrayAndFilterRandomNumeric[T arrow.NumericType](t *testing.T, mem memory.Allocator) { + dt := arrow.GetDataType[T]() rng := gen.NewRandomArrayGenerator(randomSeed, mem) t.Run("compare array and filter", func(t *testing.T) { for i := 3; i < 10; i++ { @@ -568,8 +568,8 @@ func compareArrayAndFilterRandomNumeric[T exec.NumericTypes](t *testing.T, mem m rhs := rng.Numeric(dt.ID(), length, 0, 100, 0) defer rhs.Release() - data := exec.GetData[T](lhs.Data().Buffers()[1].Bytes()) - other := exec.GetData[T](rhs.Data().Buffers()[1].Bytes()) + data := arrow.GetData[T](lhs.Data().Buffers()[1].Bytes()) + other := arrow.GetData[T](rhs.Data().Buffers()[1].Bytes()) for _, op := range []kernels.CompareOperator{kernels.CmpEQ, kernels.CmpNE, kernels.CmpGT, kernels.CmpLE} { selection := createFilterSlice(mem, data, other, op) defer selection.Release() diff --git a/go/arrow/flight/doc.go b/go/arrow/flight/doc.go index 68d1ca3458fd4..c36a808b00e4d 100644 --- a/go/arrow/flight/doc.go +++ b/go/arrow/flight/doc.go @@ -74,5 +74,4 @@ // the main thread reset the timer every time a write operation completes successfully // (that means one needs to use to_batches() + write_batch and not write_table). 
- package flight diff --git a/go/arrow/flight/server.go b/go/arrow/flight/server.go index c9c8b390a86d8..3e1da64dcf8c4 100644 --- a/go/arrow/flight/server.go +++ b/go/arrow/flight/server.go @@ -42,7 +42,7 @@ type ( FlightEndpoint = flight.FlightEndpoint Location = flight.Location FlightInfo = flight.FlightInfo - PollInfo = flight.PollInfo + PollInfo = flight.PollInfo FlightData = flight.FlightData PutResult = flight.PutResult Ticket = flight.Ticket diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 84dc638983298..49f711cdacd76 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -826,7 +826,8 @@ type Array struct { Offset interface{} `json:"OFFSET,omitempty"` Size interface{} `json:"SIZE,omitempty"` Children []Array `json:"children,omitempty"` - Variadic []string `json:"VARIADIC_BUFFERS,omitempty"` + Variadic []string `json:"VARIADIC_DATA_BUFFERS,omitempty"` + Views []interface{} `json:"VIEWS,omitempty"` } func (a *Array) MarshalJSON() ([]byte, error) { @@ -1090,7 +1091,7 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr case arrow.BinaryViewDataType: valids := validsToBitmap(validsFromJSON(arr.Valids), mem) nulls := arr.Count - bitutil.CountSetBits(valids.Bytes(), 0, arr.Count) - headers := stringHeadersFromJSON(mem, !dt.IsUtf8(), arr.Data) + headers := stringHeadersFromJSON(mem, !dt.IsUtf8(), arr.Views) extraBufs := variadicBuffersFromJSON(arr.Variadic) defer valids.Release() defer headers.Release() @@ -1513,7 +1514,7 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { Name: field.Name, Count: arr.Len(), Valids: validsToJSON(arr), - Data: stringHeadersToJSON(arr, false), + Views: stringHeadersToJSON(arr, false), Variadic: variadic, } case *array.BinaryView: @@ -1522,7 +1523,7 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { Name: field.Name, Count: arr.Len(), Valids: validsToJSON(arr), - Data: stringHeadersToJSON(arr, true), + Views: stringHeadersToJSON(arr, true), Variadic: variadic, } case *array.List: @@ -2406,7 +2407,7 @@ func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface } values[i].SetIndexOffset(int32(bufIdx), int32(bufOffset)) - prefix, err := hex.DecodeString(v["PREFIX"].(string)) + prefix, err := hex.DecodeString(v["PREFIX_HEX"].(string)) if err != nil { panic(err) } @@ -2426,7 +2427,7 @@ func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} { type StringHeader struct { Size int `json:"SIZE"` - Prefix *string `json:"PREFIX,omitempty"` + Prefix *string `json:"PREFIX_HEX,omitempty"` BufferIdx *int `json:"BUFFER_INDEX,omitempty"` BufferOff *int `json:"OFFSET,omitempty"` Inlined *string `json:"INLINED,omitempty"` diff --git a/go/arrow/internal/arrjson/arrjson_test.go b/go/arrow/internal/arrjson/arrjson_test.go index 31f3cb238ec16..164210cbc230d 100644 --- a/go/arrow/internal/arrjson/arrjson_test.go +++ b/go/arrow/internal/arrjson/arrjson_test.go @@ -6165,7 +6165,7 @@ func makeViewTypesWantJSONs() string { 1, 1 ], - "DATA": [ + "VIEWS": [ { "SIZE": 3, "INLINED": "31C3A9" @@ -6187,7 +6187,7 @@ func makeViewTypesWantJSONs() string { "INLINED": "35" } ], - "VARIADIC_BUFFERS": [""] + "VARIADIC_DATA_BUFFERS": [""] }, { "name": "string_view", @@ -6199,7 +6199,7 @@ func makeViewTypesWantJSONs() string { 1, 1 ], - "DATA": [ + "VIEWS": [ { "SIZE": 3, "INLINED": "1é" @@ -6221,7 +6221,7 @@ func makeViewTypesWantJSONs() 
string { "INLINED": "5" } ], - "VARIADIC_BUFFERS": [""] + "VARIADIC_DATA_BUFFERS": [""] } ] }, @@ -6238,7 +6238,7 @@ func makeViewTypesWantJSONs() string { 1, 1 ], - "DATA": [ + "VIEWS": [ { "SIZE": 3, "INLINED": "31C3A9" @@ -6260,7 +6260,7 @@ func makeViewTypesWantJSONs() string { "INLINED": "35353535" } ], - "VARIADIC_BUFFERS": [""] + "VARIADIC_DATA_BUFFERS": [""] }, { "name": "string_view", @@ -6272,20 +6272,20 @@ func makeViewTypesWantJSONs() string { 1, 1 ], - "DATA": [ + "VIEWS": [ { "SIZE": 3, "INLINED": "1é" }, { "SIZE": 14, - "PREFIX": "32323232", + "PREFIX_HEX": "32323232", "BUFFER_INDEX": 0, "OFFSET": 0 }, { "SIZE": 14, - "PREFIX": "33333333", + "PREFIX_HEX": "33333333", "BUFFER_INDEX": 0, "OFFSET": 14 }, @@ -6298,7 +6298,7 @@ func makeViewTypesWantJSONs() string { "INLINED": "5555" } ], - "VARIADIC_BUFFERS": [ + "VARIADIC_DATA_BUFFERS": [ "32323232323232323232323232323333333333333333333333333333" ] } @@ -6317,20 +6317,20 @@ func makeViewTypesWantJSONs() string { 1, 1 ], - "DATA": [ + "VIEWS": [ { "SIZE": 6, "INLINED": "31C3A931C3A9" }, { "SIZE": 14, - "PREFIX": "32323232", + "PREFIX_HEX": "32323232", "BUFFER_INDEX": 0, "OFFSET": 0 }, { "SIZE": 14, - "PREFIX": "33333333", + "PREFIX_HEX": "33333333", "BUFFER_INDEX": 0, "OFFSET": 14 }, @@ -6343,7 +6343,7 @@ func makeViewTypesWantJSONs() string { "INLINED": "3535" } ], - "VARIADIC_BUFFERS": [ + "VARIADIC_DATA_BUFFERS": [ "32323232323232323232323232323333333333333333333333333333" ] }, @@ -6357,7 +6357,7 @@ func makeViewTypesWantJSONs() string { 1, 1 ], - "DATA": [ + "VIEWS": [ { "SIZE": 6, "INLINED": "1é1é" @@ -6379,7 +6379,7 @@ func makeViewTypesWantJSONs() string { "INLINED": "55" } ], - "VARIADIC_BUFFERS": [""] + "VARIADIC_DATA_BUFFERS": [""] } ] } diff --git a/go/arrow/internal/testing/tools/bits.go b/go/arrow/internal/testing/tools/bits.go index c123573e2fa13..ea6a5432e5c91 100644 --- a/go/arrow/internal/testing/tools/bits.go +++ b/go/arrow/internal/testing/tools/bits.go @@ -22,7 +22,7 @@ import "math/bits" // The low bit of each nibble is tested, therefore integers should be written as 8-digit // hex numbers consisting of 1s or 0s. // -// IntsToBitsLSB(0x11001010) -> 0x35 +// IntsToBitsLSB(0x11001010) -> 0x35 func IntsToBitsLSB(v ...int32) []byte { res := make([]byte, 0, len(v)) for _, b := range v { diff --git a/go/arrow/internal/utils.go b/go/arrow/internal/utils.go index 619eebd97dc78..7b5df167ea432 100644 --- a/go/arrow/internal/utils.go +++ b/go/arrow/internal/utils.go @@ -45,3 +45,15 @@ func HasValidityBitmap(id arrow.Type, version flatbuf.MetadataVersion) bool { } return true } + +// HasBufferSizesBuffer returns whether a given type has an extra buffer +// in the C ABI to store the sizes of other buffers. Currently this is only +// StringView and BinaryView. 
+func HasBufferSizesBuffer(id arrow.Type) bool { + switch id { + case arrow.STRING_VIEW, arrow.BINARY_VIEW: + return true + default: + return false + } +} diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index dd51a761510d8..7bc7f6ebfaa09 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -521,7 +521,7 @@ func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData { case *arrow.RunEndEncodedType: field, buffers := ctx.loadCommon(dt.ID(), 1) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) runEnds := ctx.loadChild(dt.RunEnds()) defer runEnds.Release() @@ -583,7 +583,7 @@ func (ctx *arrayLoaderContext) loadPrimitive(dt arrow.DataType) arrow.ArrayData buffers = append(buffers, ctx.buffer()) } - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) return array.NewData(dt, int(field.Length()), buffers, nil, int(field.NullCount()), 0) } @@ -591,7 +591,7 @@ func (ctx *arrayLoaderContext) loadPrimitive(dt arrow.DataType) arrow.ArrayData func (ctx *arrayLoaderContext) loadBinary(dt arrow.DataType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 3) buffers = append(buffers, ctx.buffer(), ctx.buffer()) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) return array.NewData(dt, int(field.Length()), buffers, nil, int(field.NullCount()), 0) } @@ -603,7 +603,7 @@ func (ctx *arrayLoaderContext) loadBinaryView(dt arrow.DataType) arrow.ArrayData for i := 0; i < int(nVariadicBufs); i++ { buffers = append(buffers, ctx.buffer()) } - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) return array.NewData(dt, int(field.Length()), buffers, nil, int(field.NullCount()), 0) } @@ -611,7 +611,7 @@ func (ctx *arrayLoaderContext) loadBinaryView(dt arrow.DataType) arrow.ArrayData func (ctx *arrayLoaderContext) loadFixedSizeBinary(dt *arrow.FixedSizeBinaryType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 2) buffers = append(buffers, ctx.buffer()) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) return array.NewData(dt, int(field.Length()), buffers, nil, int(field.NullCount()), 0) } @@ -619,7 +619,7 @@ func (ctx *arrayLoaderContext) loadFixedSizeBinary(dt *arrow.FixedSizeBinaryType func (ctx *arrayLoaderContext) loadMap(dt *arrow.MapType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 2) buffers = append(buffers, ctx.buffer()) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) sub := ctx.loadChild(dt.Elem()) defer sub.Release() @@ -630,7 +630,7 @@ func (ctx *arrayLoaderContext) loadMap(dt *arrow.MapType) arrow.ArrayData { func (ctx *arrayLoaderContext) loadList(dt arrow.ListLikeType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 2) buffers = append(buffers, ctx.buffer()) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) sub := ctx.loadChild(dt.Elem()) defer sub.Release() @@ -641,7 +641,7 @@ func (ctx *arrayLoaderContext) loadList(dt arrow.ListLikeType) arrow.ArrayData { func (ctx *arrayLoaderContext) loadListView(dt arrow.VarLenListLikeType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 3) buffers = append(buffers, ctx.buffer(), ctx.buffer()) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) sub := ctx.loadChild(dt.Elem()) defer sub.Release() @@ -651,7 +651,7 @@ func (ctx *arrayLoaderContext) loadListView(dt arrow.VarLenListLikeType) arrow.A func (ctx *arrayLoaderContext) loadFixedSizeList(dt *arrow.FixedSizeListType) arrow.ArrayData 
{ field, buffers := ctx.loadCommon(dt.ID(), 1) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) sub := ctx.loadChild(dt.Elem()) defer sub.Release() @@ -661,7 +661,7 @@ func (ctx *arrayLoaderContext) loadFixedSizeList(dt *arrow.FixedSizeListType) ar func (ctx *arrayLoaderContext) loadStruct(dt *arrow.StructType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 1) - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) subs := make([]arrow.ArrayData, dt.NumFields()) for i, f := range dt.Fields() { @@ -704,7 +704,7 @@ func (ctx *arrayLoaderContext) loadUnion(dt arrow.UnionType) arrow.ArrayData { } } - defer releaseBuffers(buffers) + defer memory.ReleaseBuffers(buffers) subs := make([]arrow.ArrayData, dt.NumFields()) for i, f := range dt.Fields() { subs[i] = ctx.loadChild(f.Type) @@ -768,11 +768,3 @@ func readDictionary(memo *dictutils.Memo, meta *memory.Buffer, body ReadAtSeeker } return dictutils.KindReplacement, nil } - -func releaseBuffers(buffers []*memory.Buffer) { - for _, b := range buffers { - if b != nil { - b.Release() - } - } -} diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index e9d59f0e35e00..31ce53a0f1af7 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -34,6 +34,7 @@ import ( "github.com/apache/arrow/go/v15/arrow/internal/dictutils" "github.com/apache/arrow/go/v15/arrow/internal/flatbuf" "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/apache/arrow/go/v15/internal/utils" ) type swriter struct { @@ -746,42 +747,22 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { w.depth++ case *arrow.ListViewType, *arrow.LargeListViewType: - data := arr.Data() arr := arr.(array.VarLenListLike) - offsetTraits := arr.DataType().(arrow.OffsetsDataType).OffsetTypeTraits() - rngOff, rngLen := array.RangeOfValuesUsed(arr) - voffsets := w.getValueOffsetsAtBaseValue(arr, rngOff) - p.body = append(p.body, voffsets) - vsizes := data.Buffers()[2] - if vsizes != nil { - if data.Offset() != 0 || vsizes.Len() > offsetTraits.BytesRequired(arr.Len()) { - beg := offsetTraits.BytesRequired(data.Offset()) - end := beg + offsetTraits.BytesRequired(data.Len()) - vsizes = memory.NewBufferBytes(vsizes.Bytes()[beg:end]) - } else { - vsizes.Retain() - } - } + voffsets, minOffset, maxEnd := w.getZeroBasedListViewOffsets(arr) + vsizes := w.getListViewSizes(arr) + + p.body = append(p.body, voffsets) p.body = append(p.body, vsizes) w.depth-- var ( - values = arr.ListValues() - mustRelease = false - values_offset = int64(rngOff) - values_end = int64(rngOff + rngLen) + values = arr.ListValues() ) - defer func() { - if mustRelease { - values.Release() - } - }() - if arr.Len() > 0 && values_end < int64(values.Len()) { - // must also slice the values - values = array.NewSlice(values, values_offset, values_end) - mustRelease = true + if minOffset != 0 || maxEnd < int64(values.Len()) { + values = array.NewSlice(values, minOffset, maxEnd) + defer values.Release() } err := w.visit(p, values) @@ -882,61 +863,92 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) *memory.Buffer return voffsets } -// Truncates the offsets if needed and shifts the values if minOffset > 0. -// The offsets returned are corrected assuming the child values are truncated -// and now start at minOffset. -// -// This function only works on offset buffers of ListViews and LargeListViews. -// TODO(felipecrv): Unify this with getZeroBasedValueOffsets. 
-func (w *recordEncoder) getValueOffsetsAtBaseValue(arr arrow.Array, minOffset int) *memory.Buffer { - data := arr.Data() - voffsets := data.Buffers()[1] - offsetTraits := arr.DataType().(arrow.OffsetsDataType).OffsetTypeTraits() - offsetBytesNeeded := offsetTraits.BytesRequired(data.Len()) +func getZeroBasedListViewOffsets[OffsetT int32 | int64](mem memory.Allocator, arr array.VarLenListLike) (valueOffsets *memory.Buffer, minOffset, maxEnd OffsetT) { + requiredBytes := int(unsafe.Sizeof(minOffset)) * arr.Len() + if arr.Data().Offset() == 0 { + // slice offsets to used extent, in case we have truncated slice + minOffset, maxEnd = 0, OffsetT(arr.ListValues().Len()) + valueOffsets = arr.Data().Buffers()[1] + if valueOffsets.Len() > requiredBytes { + valueOffsets = memory.SliceBuffer(valueOffsets, 0, requiredBytes) + } else { + valueOffsets.Retain() + } + return + } - if voffsets == nil || voffsets.Len() == 0 { - return nil + // non-zero offset, it's likely that the smallest offset is not zero + // we must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly + + valueOffsets = memory.NewResizableBuffer(mem) + valueOffsets.Resize(requiredBytes) + if arr.Len() > 0 { + // max value of int32/int64 based on type + minOffset = (^OffsetT(0)) << ((8 * unsafe.Sizeof(minOffset)) - 1) + for i := 0; i < arr.Len(); i++ { + start, end := arr.ValueOffsets(i) + minOffset = utils.Min(minOffset, OffsetT(start)) + maxEnd = utils.Max(maxEnd, OffsetT(end)) + } + } + + offsets := arrow.GetData[OffsetT](arr.Data().Buffers()[1].Bytes())[arr.Data().Offset():] + destOffset := arrow.GetData[OffsetT](valueOffsets.Bytes()) + for i := 0; i < arr.Len(); i++ { + destOffset[i] = offsets[i] - minOffset } + return +} - needsTruncate := data.Offset() != 0 || offsetBytesNeeded < voffsets.Len() - needsShift := minOffset > 0 +func getListViewSizes[OffsetT int32 | int64](arr array.VarLenListLike) *memory.Buffer { + var z OffsetT + requiredBytes := int(unsafe.Sizeof(z)) * arr.Len() + sizes := arr.Data().Buffers()[2] - if needsTruncate || needsShift { - shiftedOffsets := memory.NewResizableBuffer(w.mem) - shiftedOffsets.Resize(offsetBytesNeeded) + if arr.Data().Offset() != 0 || sizes.Len() > requiredBytes { + // slice offsets to used extent, in case we have truncated slice + offsetBytes := arr.Data().Offset() * int(unsafe.Sizeof(z)) + sizes = memory.SliceBuffer(sizes, offsetBytes, requiredBytes) + } else { + sizes.Retain() + } + return sizes +} - switch arr.DataType().Layout().Buffers[1].ByteWidth { - case 8: - dest := arrow.Int64Traits.CastFromBytes(shiftedOffsets.Bytes()) - offsets := arrow.Int64Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : data.Offset()+data.Len()] +func (w *recordEncoder) getZeroBasedListViewOffsets(arr array.VarLenListLike) (*memory.Buffer, int64, int64) { + if arr.Len() == 0 { + return nil, 0, 0 + } - if minOffset > 0 { - for i, o := range offsets { - dest[i] = o - int64(minOffset) - } - } else { - copy(dest, offsets) - } - default: - debug.Assert(arr.DataType().Layout().Buffers[1].ByteWidth == 4, "invalid offset bytewidth") - dest := arrow.Int32Traits.CastFromBytes(shiftedOffsets.Bytes()) - offsets := arrow.Int32Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : data.Offset()+data.Len()] + var ( + outOffsets *memory.Buffer + minOff, maxEnd int64 + ) - if minOffset > 0 { - for i, o := range offsets { - dest[i] = o - int32(minOffset) - } - } else { - copy(dest, offsets) - } - } + switch v := arr.(type) { + case *array.ListView: + voffsets, outOff, outEnd 
:= getZeroBasedListViewOffsets[int32](w.mem, v) + outOffsets = voffsets + minOff, maxEnd = int64(outOff), int64(outEnd) + case *array.LargeListView: + outOffsets, minOff, maxEnd = getZeroBasedListViewOffsets[int64](w.mem, v) + } + return outOffsets, minOff, maxEnd +} - voffsets = shiftedOffsets - } else { - voffsets.Retain() +func (w *recordEncoder) getListViewSizes(arr array.VarLenListLike) *memory.Buffer { + if arr.Len() == 0 { + return nil } - return voffsets + switch v := arr.(type) { + case *array.ListView: + return getListViewSizes[int32](v) + case *array.LargeListView: + return getListViewSizes[int64](v) + } + return nil } func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer { diff --git a/go/arrow/memory/util.go b/go/arrow/memory/util.go index 3b0d3a5cb9ef1..6cc7ec91b9638 100644 --- a/go/arrow/memory/util.go +++ b/go/arrow/memory/util.go @@ -35,3 +35,11 @@ func isMultipleOfPowerOf2(v int, d int) bool { func addressOf(b []byte) uintptr { return uintptr(unsafe.Pointer(&b[0])) } + +func ReleaseBuffers(buffers []*Buffer) { + for _, b := range buffers { + if b != nil { + b.Release() + } + } +} diff --git a/go/arrow/type_traits.go b/go/arrow/type_traits.go new file mode 100644 index 0000000000000..67fa8a266b35f --- /dev/null +++ b/go/arrow/type_traits.go @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/decimal128" + "github.com/apache/arrow/go/v15/arrow/decimal256" + "github.com/apache/arrow/go/v15/arrow/float16" + "golang.org/x/exp/constraints" +) + +// IntType is a type constraint for raw values represented as signed +// integer types by We aren't just using constraints.Signed +// because we don't want to include the raw `int` type here whose size +// changes based on the architecture (int32 on 32-bit architectures and +// int64 on 64-bit architectures). +// +// This will also cover types like MonthInterval or the time types +// as their underlying types are int32 and int64 which will get covered +// by using the ~ +type IntType interface { + ~int8 | ~int16 | ~int32 | ~int64 +} + +// UintType is a type constraint for raw values represented as unsigned +// integer types by We aren't just using constraints.Unsigned +// because we don't want to include the raw `uint` type here whose size +// changes based on the architecture (uint32 on 32-bit architectures and +// uint64 on 64-bit architectures). 
We also don't want to include uintptr +type UintType interface { + ~uint8 | ~uint16 | ~uint32 | ~uint64 +} + +// FloatType is a type constraint for raw values for representing +// floating point values in This consists of constraints.Float and +// float16.Num +type FloatType interface { + float16.Num | constraints.Float +} + +// NumericType is a type constraint for just signed/unsigned integers +// and float32/float64. +type NumericType interface { + IntType | UintType | constraints.Float +} + +// FixedWidthType is a type constraint for raw values in Arrow that +// can be represented as FixedWidth byte slices. Specifically this is for +// using Go generics to easily re-type a byte slice to a properly-typed +// slice. Booleans are excluded here since they are represented by Arrow +// as a bitmap and thus the buffer can't be just reinterpreted as a []bool +type FixedWidthType interface { + IntType | UintType | + FloatType | decimal128.Num | decimal256.Num | + DayTimeInterval | MonthDayNanoInterval +} + +type TemporalType interface { + Date32 | Date64 | Time32 | Time64 | + Timestamp | Duration | DayTimeInterval | + MonthInterval | MonthDayNanoInterval +} + +func reinterpretSlice[Out, T any](b []T) []Out { + if cap(b) == 0 { + return nil + } + out := (*Out)(unsafe.Pointer(&b[:1][0])) + + lenBytes := len(b) * int(unsafe.Sizeof(b[0])) + capBytes := cap(b) * int(unsafe.Sizeof(b[0])) + + lenOut := lenBytes / int(unsafe.Sizeof(*out)) + capOut := capBytes / int(unsafe.Sizeof(*out)) + + return unsafe.Slice(out, capOut)[:lenOut] +} + +// GetValues reinterprets the data.Buffers()[i] to a slice of T with len=data.Len(). +// +// If the buffer is nil, nil will be returned. +// +// NOTE: the buffer's length must be a multiple of Sizeof(T). +func GetValues[T FixedWidthType](data ArrayData, i int) []T { + if data.Buffers()[i] == nil || data.Buffers()[i].Len() == 0 { + return nil + } + return reinterpretSlice[T](data.Buffers()[i].Bytes())[data.Offset() : data.Offset()+data.Len()] +} + +// GetOffsets reinterprets the data.Buffers()[i] to a slice of T with len=data.Len()+1. +// +// NOTE: the buffer's length must be a multiple of Sizeof(T). +func GetOffsets[T int32 | int64](data ArrayData, i int) []T { + return reinterpretSlice[T](data.Buffers()[i].Bytes())[data.Offset() : data.Offset()+data.Len()+1] +} + +// GetBytes reinterprets a slice of T to a slice of bytes. +func GetBytes[T FixedWidthType | ViewHeader](in []T) []byte { + return reinterpretSlice[byte](in) +} + +// GetData reinterprets a slice of bytes to a slice of T. +// +// NOTE: the buffer's length must be a multiple of Sizeof(T). 
+func GetData[T FixedWidthType | ViewHeader](in []byte) []T { + return reinterpretSlice[T](in) +} + +var typMap = map[reflect.Type]DataType{ + reflect.TypeOf(false): FixedWidthTypes.Boolean, + reflect.TypeOf(int8(0)): PrimitiveTypes.Int8, + reflect.TypeOf(int16(0)): PrimitiveTypes.Int16, + reflect.TypeOf(int32(0)): PrimitiveTypes.Int32, + reflect.TypeOf(int64(0)): PrimitiveTypes.Int64, + reflect.TypeOf(uint8(0)): PrimitiveTypes.Uint8, + reflect.TypeOf(uint16(0)): PrimitiveTypes.Uint16, + reflect.TypeOf(uint32(0)): PrimitiveTypes.Uint32, + reflect.TypeOf(uint64(0)): PrimitiveTypes.Uint64, + reflect.TypeOf(float32(0)): PrimitiveTypes.Float32, + reflect.TypeOf(float64(0)): PrimitiveTypes.Float64, + reflect.TypeOf(string("")): BinaryTypes.String, + reflect.TypeOf(Date32(0)): FixedWidthTypes.Date32, + reflect.TypeOf(Date64(0)): FixedWidthTypes.Date64, + reflect.TypeOf(true): FixedWidthTypes.Boolean, + reflect.TypeOf(float16.Num{}): FixedWidthTypes.Float16, + reflect.TypeOf([]byte{}): BinaryTypes.Binary, +} + +// GetDataType returns the appropriate DataType for the given type T +// only for non-parametric types. This uses a map and reflection internally +// so don't call this in a tight loop, instead call this once and then use +// a closure with the result. +func GetDataType[T NumericType | bool | string | []byte | float16.Num]() DataType { + var z T + return typMap[reflect.TypeOf(z)] +} + +// GetType returns the appropriate Type type T, only for non-parametric +// types. This uses a map and reflection internally so don't call this in +// a tight loop, instead call it once and then use a closure with the result. +func GetType[T NumericType | bool | string]() Type { + var z T + return typMap[reflect.TypeOf(z)].ID() +} diff --git a/go/arrow/type_traits_decimal128.go b/go/arrow/type_traits_decimal128.go index f573ad3c65a4c..d600ba29c1186 100644 --- a/go/arrow/type_traits_decimal128.go +++ b/go/arrow/type_traits_decimal128.go @@ -17,7 +17,6 @@ package arrow import ( - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/decimal128" @@ -47,16 +46,12 @@ func (decimal128Traits) PutValue(b []byte, v decimal128.Num) { // // NOTE: len(b) must be a multiple of Uint16SizeBytes. func (decimal128Traits) CastFromBytes(b []byte) []decimal128.Num { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*decimal128.Num)(unsafe.Pointer(h.Data)), cap(b)/Decimal128SizeBytes)[:len(b)/Decimal128SizeBytes] + return GetData[decimal128.Num](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (decimal128Traits) CastToBytes(b []decimal128.Num) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Decimal128SizeBytes)[:len(b)*Decimal128SizeBytes] + return GetBytes(b) } // Copy copies src to dst. 
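
The trait changes in this patch (decimal128 above, and decimal256, float16, interval, numeric, and timestamp below) all follow the same pattern: the hand-rolled `reflect.SliceHeader` casts are replaced by one-line calls to the new generic `GetData`/`GetBytes` helpers defined in `type_traits.go` above. As a rough illustration only — a minimal sketch assuming the `arrow` package from this patch at the v15 module path used throughout — the helpers can be used directly like this:

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v15/arrow"
)

func main() {
	// Reinterpret a typed slice as raw bytes and back without copying,
	// which is what the per-type CastToBytes/CastFromBytes trait methods
	// now delegate to.
	vals := []int32{1, 2, 3, 4}
	raw := arrow.GetBytes(vals)       // []int32 -> []byte
	back := arrow.GetData[int32](raw) // []byte -> []int32

	// GetDataType maps a Go type parameter to its Arrow DataType.
	fmt.Println(len(raw), back, arrow.GetDataType[int32]()) // 16 [1 2 3 4] int32
}
```

Besides removing the duplicated unsafe code from every traits file, this also drops their use of `reflect.SliceHeader`, which has been deprecated since Go 1.20.
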
diff --git a/go/arrow/type_traits_decimal256.go b/go/arrow/type_traits_decimal256.go index adf3cc3e0bc31..fded46a0a52d0 100644 --- a/go/arrow/type_traits_decimal256.go +++ b/go/arrow/type_traits_decimal256.go @@ -17,7 +17,6 @@ package arrow import ( - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/decimal256" @@ -44,15 +43,11 @@ func (decimal256Traits) PutValue(b []byte, v decimal256.Num) { // CastFromBytes reinterprets the slice b to a slice of decimal256 func (decimal256Traits) CastFromBytes(b []byte) []decimal256.Num { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*decimal256.Num)(unsafe.Pointer(h.Data)), cap(b)/Decimal256SizeBytes)[:len(b)/Decimal256SizeBytes] + return GetData[decimal256.Num](b) } func (decimal256Traits) CastToBytes(b []decimal256.Num) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Decimal256SizeBytes)[:len(b)*Decimal256SizeBytes] + return GetBytes(b) } func (decimal256Traits) Copy(dst, src []decimal256.Num) { copy(dst, src) } diff --git a/go/arrow/type_traits_float16.go b/go/arrow/type_traits_float16.go index e59efd4c248d8..5369ad352f839 100644 --- a/go/arrow/type_traits_float16.go +++ b/go/arrow/type_traits_float16.go @@ -17,7 +17,6 @@ package arrow import ( - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/endian" @@ -46,16 +45,12 @@ func (float16Traits) PutValue(b []byte, v float16.Num) { // // NOTE: len(b) must be a multiple of Uint16SizeBytes. func (float16Traits) CastFromBytes(b []byte) []float16.Num { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*float16.Num)(unsafe.Pointer(h.Data)), cap(b)/Float16SizeBytes)[:len(b)/Float16SizeBytes] + return GetData[float16.Num](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (float16Traits) CastToBytes(b []float16.Num) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Float16SizeBytes)[:len(b)*Float16SizeBytes] + return GetBytes(b) } // Copy copies src to dst. diff --git a/go/arrow/type_traits_interval.go b/go/arrow/type_traits_interval.go index 5fbd7a5248918..ca530a72323ff 100644 --- a/go/arrow/type_traits_interval.go +++ b/go/arrow/type_traits_interval.go @@ -17,7 +17,6 @@ package arrow import ( - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/endian" @@ -57,16 +56,12 @@ func (monthTraits) PutValue(b []byte, v MonthInterval) { // // NOTE: len(b) must be a multiple of MonthIntervalSizeBytes. func (monthTraits) CastFromBytes(b []byte) []MonthInterval { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*MonthInterval)(unsafe.Pointer(h.Data)), cap(b)/MonthIntervalSizeBytes)[:len(b)/MonthIntervalSizeBytes] + return GetData[MonthInterval](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (monthTraits) CastToBytes(b []MonthInterval) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*MonthIntervalSizeBytes)[:len(b)*MonthIntervalSizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -94,16 +89,12 @@ func (daytimeTraits) PutValue(b []byte, v DayTimeInterval) { // // NOTE: len(b) must be a multiple of DayTimeIntervalSizeBytes. 
func (daytimeTraits) CastFromBytes(b []byte) []DayTimeInterval { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*DayTimeInterval)(unsafe.Pointer(h.Data)), cap(b)/DayTimeIntervalSizeBytes)[:len(b)/DayTimeIntervalSizeBytes] + return GetData[DayTimeInterval](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (daytimeTraits) CastToBytes(b []DayTimeInterval) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*DayTimeIntervalSizeBytes)[:len(b)*DayTimeIntervalSizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -132,16 +123,12 @@ func (monthDayNanoTraits) PutValue(b []byte, v MonthDayNanoInterval) { // // NOTE: len(b) must be a multiple of MonthDayNanoIntervalSizeBytes. func (monthDayNanoTraits) CastFromBytes(b []byte) []MonthDayNanoInterval { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*MonthDayNanoInterval)(unsafe.Pointer(h.Data)), cap(b)/MonthDayNanoIntervalSizeBytes)[:len(b)/MonthDayNanoIntervalSizeBytes] + return GetData[MonthDayNanoInterval](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (monthDayNanoTraits) CastToBytes(b []MonthDayNanoInterval) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*MonthDayNanoIntervalSizeBytes)[:len(b)*MonthDayNanoIntervalSizeBytes] + return GetBytes(b) } // Copy copies src to dst. diff --git a/go/arrow/type_traits_numeric.gen.go b/go/arrow/type_traits_numeric.gen.go index 57606c0fce6df..06412466032f9 100644 --- a/go/arrow/type_traits_numeric.gen.go +++ b/go/arrow/type_traits_numeric.gen.go @@ -20,7 +20,6 @@ package arrow import ( "math" - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/endian" @@ -65,16 +64,12 @@ func (int64Traits) PutValue(b []byte, v int64) { // // NOTE: len(b) must be a multiple of Int64SizeBytes. func (int64Traits) CastFromBytes(b []byte) []int64 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*int64)(unsafe.Pointer(h.Data)), cap(b)/Int64SizeBytes)[:len(b)/Int64SizeBytes] + return GetData[int64](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (int64Traits) CastToBytes(b []int64) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Int64SizeBytes)[:len(b)*Int64SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -101,16 +96,12 @@ func (uint64Traits) PutValue(b []byte, v uint64) { // // NOTE: len(b) must be a multiple of Uint64SizeBytes. func (uint64Traits) CastFromBytes(b []byte) []uint64 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*uint64)(unsafe.Pointer(h.Data)), cap(b)/Uint64SizeBytes)[:len(b)/Uint64SizeBytes] + return GetData[uint64](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (uint64Traits) CastToBytes(b []uint64) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Uint64SizeBytes)[:len(b)*Uint64SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -137,16 +128,12 @@ func (float64Traits) PutValue(b []byte, v float64) { // // NOTE: len(b) must be a multiple of Float64SizeBytes. 
func (float64Traits) CastFromBytes(b []byte) []float64 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*float64)(unsafe.Pointer(h.Data)), cap(b)/Float64SizeBytes)[:len(b)/Float64SizeBytes] + return GetData[float64](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (float64Traits) CastToBytes(b []float64) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Float64SizeBytes)[:len(b)*Float64SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -173,16 +160,12 @@ func (int32Traits) PutValue(b []byte, v int32) { // // NOTE: len(b) must be a multiple of Int32SizeBytes. func (int32Traits) CastFromBytes(b []byte) []int32 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*int32)(unsafe.Pointer(h.Data)), cap(b)/Int32SizeBytes)[:len(b)/Int32SizeBytes] + return GetData[int32](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (int32Traits) CastToBytes(b []int32) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Int32SizeBytes)[:len(b)*Int32SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -209,16 +192,12 @@ func (uint32Traits) PutValue(b []byte, v uint32) { // // NOTE: len(b) must be a multiple of Uint32SizeBytes. func (uint32Traits) CastFromBytes(b []byte) []uint32 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*uint32)(unsafe.Pointer(h.Data)), cap(b)/Uint32SizeBytes)[:len(b)/Uint32SizeBytes] + return GetData[uint32](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (uint32Traits) CastToBytes(b []uint32) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Uint32SizeBytes)[:len(b)*Uint32SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -245,16 +224,12 @@ func (float32Traits) PutValue(b []byte, v float32) { // // NOTE: len(b) must be a multiple of Float32SizeBytes. func (float32Traits) CastFromBytes(b []byte) []float32 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*float32)(unsafe.Pointer(h.Data)), cap(b)/Float32SizeBytes)[:len(b)/Float32SizeBytes] + return GetData[float32](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (float32Traits) CastToBytes(b []float32) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Float32SizeBytes)[:len(b)*Float32SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -281,16 +256,12 @@ func (int16Traits) PutValue(b []byte, v int16) { // // NOTE: len(b) must be a multiple of Int16SizeBytes. func (int16Traits) CastFromBytes(b []byte) []int16 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*int16)(unsafe.Pointer(h.Data)), cap(b)/Int16SizeBytes)[:len(b)/Int16SizeBytes] + return GetData[int16](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (int16Traits) CastToBytes(b []int16) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Int16SizeBytes)[:len(b)*Int16SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -317,16 +288,12 @@ func (uint16Traits) PutValue(b []byte, v uint16) { // // NOTE: len(b) must be a multiple of Uint16SizeBytes. 
func (uint16Traits) CastFromBytes(b []byte) []uint16 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*uint16)(unsafe.Pointer(h.Data)), cap(b)/Uint16SizeBytes)[:len(b)/Uint16SizeBytes] + return GetData[uint16](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (uint16Traits) CastToBytes(b []uint16) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Uint16SizeBytes)[:len(b)*Uint16SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -353,16 +320,12 @@ func (int8Traits) PutValue(b []byte, v int8) { // // NOTE: len(b) must be a multiple of Int8SizeBytes. func (int8Traits) CastFromBytes(b []byte) []int8 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*int8)(unsafe.Pointer(h.Data)), cap(b)/Int8SizeBytes)[:len(b)/Int8SizeBytes] + return GetData[int8](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (int8Traits) CastToBytes(b []int8) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Int8SizeBytes)[:len(b)*Int8SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -389,16 +352,12 @@ func (uint8Traits) PutValue(b []byte, v uint8) { // // NOTE: len(b) must be a multiple of Uint8SizeBytes. func (uint8Traits) CastFromBytes(b []byte) []uint8 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*uint8)(unsafe.Pointer(h.Data)), cap(b)/Uint8SizeBytes)[:len(b)/Uint8SizeBytes] + return GetData[uint8](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (uint8Traits) CastToBytes(b []uint8) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Uint8SizeBytes)[:len(b)*Uint8SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -425,16 +384,12 @@ func (time32Traits) PutValue(b []byte, v Time32) { // // NOTE: len(b) must be a multiple of Time32SizeBytes. func (time32Traits) CastFromBytes(b []byte) []Time32 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*Time32)(unsafe.Pointer(h.Data)), cap(b)/Time32SizeBytes)[:len(b)/Time32SizeBytes] + return GetData[Time32](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (time32Traits) CastToBytes(b []Time32) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Time32SizeBytes)[:len(b)*Time32SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -461,16 +416,12 @@ func (time64Traits) PutValue(b []byte, v Time64) { // // NOTE: len(b) must be a multiple of Time64SizeBytes. func (time64Traits) CastFromBytes(b []byte) []Time64 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*Time64)(unsafe.Pointer(h.Data)), cap(b)/Time64SizeBytes)[:len(b)/Time64SizeBytes] + return GetData[Time64](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (time64Traits) CastToBytes(b []Time64) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Time64SizeBytes)[:len(b)*Time64SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -497,16 +448,12 @@ func (date32Traits) PutValue(b []byte, v Date32) { // // NOTE: len(b) must be a multiple of Date32SizeBytes. 
func (date32Traits) CastFromBytes(b []byte) []Date32 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*Date32)(unsafe.Pointer(h.Data)), cap(b)/Date32SizeBytes)[:len(b)/Date32SizeBytes] + return GetData[Date32](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (date32Traits) CastToBytes(b []Date32) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Date32SizeBytes)[:len(b)*Date32SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -533,16 +480,12 @@ func (date64Traits) PutValue(b []byte, v Date64) { // // NOTE: len(b) must be a multiple of Date64SizeBytes. func (date64Traits) CastFromBytes(b []byte) []Date64 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*Date64)(unsafe.Pointer(h.Data)), cap(b)/Date64SizeBytes)[:len(b)/Date64SizeBytes] + return GetData[Date64](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (date64Traits) CastToBytes(b []Date64) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*Date64SizeBytes)[:len(b)*Date64SizeBytes] + return GetBytes(b) } // Copy copies src to dst. @@ -569,16 +512,12 @@ func (durationTraits) PutValue(b []byte, v Duration) { // // NOTE: len(b) must be a multiple of DurationSizeBytes. func (durationTraits) CastFromBytes(b []byte) []Duration { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*Duration)(unsafe.Pointer(h.Data)), cap(b)/DurationSizeBytes)[:len(b)/DurationSizeBytes] + return GetData[Duration](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (durationTraits) CastToBytes(b []Duration) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*DurationSizeBytes)[:len(b)*DurationSizeBytes] + return GetBytes(b) } // Copy copies src to dst. diff --git a/go/arrow/type_traits_numeric.gen.go.tmpl b/go/arrow/type_traits_numeric.gen.go.tmpl index c491047b51429..e98f59528c6aa 100644 --- a/go/arrow/type_traits_numeric.gen.go.tmpl +++ b/go/arrow/type_traits_numeric.gen.go.tmpl @@ -18,7 +18,6 @@ package arrow import ( "math" - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/endian" @@ -66,16 +65,12 @@ func ({{.name}}Traits) PutValue(b []byte, v {{.Type}}) { // // NOTE: len(b) must be a multiple of {{.Name}}SizeBytes. func ({{.name}}Traits) CastFromBytes(b []byte) []{{.Type}} { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*{{.Type}})(unsafe.Pointer(h.Data)), cap(b)/{{.Name}}SizeBytes)[:len(b)/{{.Name}}SizeBytes] + return GetData[{{.Type}}](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func ({{.name}}Traits) CastToBytes(b []{{.Type}}) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*{{.Name}}SizeBytes)[:len(b)*{{.Name}}SizeBytes] + return GetBytes(b) } // Copy copies src to dst. diff --git a/go/arrow/type_traits_timestamp.go b/go/arrow/type_traits_timestamp.go index c1a9aba3db386..8e9970a719f54 100644 --- a/go/arrow/type_traits_timestamp.go +++ b/go/arrow/type_traits_timestamp.go @@ -17,7 +17,6 @@ package arrow import ( - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/endian" @@ -43,16 +42,12 @@ func (timestampTraits) PutValue(b []byte, v Timestamp) { // // NOTE: len(b) must be a multiple of TimestampSizeBytes. 
func (timestampTraits) CastFromBytes(b []byte) []Timestamp { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*Timestamp)(unsafe.Pointer(h.Data)), cap(b)/TimestampSizeBytes)[:len(b)/TimestampSizeBytes] + return GetData[Timestamp](b) } // CastToBytes reinterprets the slice b to a slice of bytes. func (timestampTraits) CastToBytes(b []Timestamp) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*TimestampSizeBytes)[:len(b)*TimestampSizeBytes] + return GetBytes(b) } // Copy copies src to dst. diff --git a/go/arrow/type_traits_view.go b/go/arrow/type_traits_view.go index c3846db294681..be3f15fed69ae 100644 --- a/go/arrow/type_traits_view.go +++ b/go/arrow/type_traits_view.go @@ -17,7 +17,6 @@ package arrow import ( - "reflect" "unsafe" "github.com/apache/arrow/go/v15/arrow/endian" @@ -39,15 +38,11 @@ func (viewHeaderTraits) PutValue(b []byte, v ViewHeader) { } func (viewHeaderTraits) CastFromBytes(b []byte) (res []ViewHeader) { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*ViewHeader)(unsafe.Pointer(h.Data)), cap(b)/ViewHeaderSizeBytes)[:len(b)/ViewHeaderSizeBytes] + return GetData[ViewHeader](b) } func (viewHeaderTraits) CastToBytes(b []ViewHeader) (res []byte) { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*ViewHeaderSizeBytes)[:len(b)*ViewHeaderSizeBytes] + return GetBytes(b) } func (viewHeaderTraits) Copy(dst, src []ViewHeader) { copy(dst, src) } diff --git a/go/internal/bitutils/bit_set_run_reader.go b/go/internal/bitutils/bit_set_run_reader.go index 6764ca7912679..374b8d4aab39a 100644 --- a/go/internal/bitutils/bit_set_run_reader.go +++ b/go/internal/bitutils/bit_set_run_reader.go @@ -113,7 +113,7 @@ func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) { bitOffset := int8(startOffset % 8) if length > 0 && bitOffset != 0 { - br.curNumBits = int32(utils.MinInt(int(length), int(8-bitOffset))) + br.curNumBits = int32(utils.Min(int(length), int(8-bitOffset))) br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits)) } return @@ -124,7 +124,7 @@ func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) { endBitOffset := int8((startOffset + length) % 8) if length > 0 && endBitOffset != 0 { br.pos++ - br.curNumBits = int32(utils.MinInt(int(length), int(endBitOffset))) + br.curNumBits = int32(utils.Min(int(length), int(endBitOffset))) br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits)) } } @@ -219,7 +219,7 @@ func (br *baseSetBitRunReader) skipNextZeros() { if br.remaining > 0 { br.curWord = br.loadPartial(0, br.remaining) br.curNumBits = int32(br.remaining) - nzeros := int32(utils.MinInt(int(br.curNumBits), int(br.countFirstZeros(br.curWord)))) + nzeros := int32(utils.Min(int(br.curNumBits), int(br.countFirstZeros(br.curWord)))) br.curWord = br.consumeBits(br.curWord, nzeros) br.curNumBits -= nzeros br.remaining -= int64(nzeros) diff --git a/go/internal/utils/math.go b/go/internal/utils/math.go index 62cf96ce43156..c8311750e3a4c 100644 --- a/go/internal/utils/math.go +++ b/go/internal/utils/math.go @@ -16,32 +16,16 @@ package utils -// Min is a convenience Min function for int64 -func Min(a, b int64) int64 { - if a < b { - return a - } - return b -} +import "golang.org/x/exp/constraints" -// MinInt is a convenience Min function for int -func MinInt(a, b int) int { +func Min[T constraints.Ordered](a, b T) T { if a < b { return a } return b } 
-// Max is a convenience Max function for int64 -func Max(a, b int64) int64 { - if a > b { - return a - } - return b -} - -// MaxInt is a convenience Max function for int -func MaxInt(a, b int) int { +func Max[T constraints.Ordered](a, b T) T { if a > b { return a } diff --git a/go/parquet/file/column_reader.go b/go/parquet/file/column_reader.go index 766638d88f26c..342fb3b198abe 100644 --- a/go/parquet/file/column_reader.go +++ b/go/parquet/file/column_reader.go @@ -517,7 +517,7 @@ func (c *columnChunkReader) readBatch(batchSize int64, defLvls, repLvls []int16, // if this is a required field, ndefs will be 0 since there is no definition // levels stored with it and `read` will be the number of values, otherwise // we use ndefs since it will be equal to or greater than read. - totalVals := int64(utils.MaxInt(ndefs, read)) + totalVals := int64(utils.Max(ndefs, read)) c.consumeBufferedValues(totalVals) totalLvls += totalVals diff --git a/go/parquet/file/column_reader_test.go b/go/parquet/file/column_reader_test.go index 21ea52e2b7bbc..a6725bc02fee0 100755 --- a/go/parquet/file/column_reader_test.go +++ b/go/parquet/file/column_reader_test.go @@ -244,7 +244,7 @@ func (p *PrimitiveReaderSuite) checkResults(typ reflect.Type) { totalRead += batch batchActual += int(read) - batchSize = int32(utils.MinInt(1<<24, utils.MaxInt(int(batchSize*2), 4096))) + batchSize = int32(utils.Min(1<<24, utils.Max(int(batchSize*2), 4096))) if batch <= 0 { break } diff --git a/go/parquet/file/level_conversion.go b/go/parquet/file/level_conversion.go index f6707fce86d80..251468658ae30 100755 --- a/go/parquet/file/level_conversion.go +++ b/go/parquet/file/level_conversion.go @@ -144,7 +144,7 @@ func defLevelsBatchToBitmap(defLevels []int16, remainingUpperBound int64, info L var batch []int16 for len(defLevels) > 0 { - batchSize := shared_utils.MinInt(maxbatch, len(defLevels)) + batchSize := shared_utils.Min(maxbatch, len(defLevels)) batch, defLevels = defLevels[:batchSize], defLevels[batchSize:] definedBitmap := bmi.GreaterThanBitmap(batch, info.DefLevel-1) diff --git a/go/parquet/internal/encoding/boolean_decoder.go b/go/parquet/internal/encoding/boolean_decoder.go index 3782dc85ea814..353f443855952 100644 --- a/go/parquet/internal/encoding/boolean_decoder.go +++ b/go/parquet/internal/encoding/boolean_decoder.go @@ -55,7 +55,7 @@ func (dec *PlainBooleanDecoder) SetData(nvals int, data []byte) error { // // Returns the number of values decoded func (dec *PlainBooleanDecoder) Decode(out []bool) (int, error) { - max := shared_utils.MinInt(len(out), dec.nvals) + max := shared_utils.Min(len(out), dec.nvals) // attempts to read all remaining bool values from the current data byte unalignedExtract := func(i int) int { @@ -148,7 +148,7 @@ func (dec *RleBooleanDecoder) SetData(nvals int, data []byte) error { } func (dec *RleBooleanDecoder) Decode(out []bool) (int, error) { - max := shared_utils.MinInt(len(out), dec.nvals) + max := shared_utils.Min(len(out), dec.nvals) var ( buf [1024]uint64 @@ -156,7 +156,7 @@ func (dec *RleBooleanDecoder) Decode(out []bool) (int, error) { ) for n > 0 { - batch := shared_utils.MinInt(len(buf), n) + batch := shared_utils.Min(len(buf), n) decoded := dec.rleDec.GetBatch(buf[:batch]) if decoded != batch { return max - n, io.ErrUnexpectedEOF diff --git a/go/parquet/internal/encoding/byte_array_decoder.go b/go/parquet/internal/encoding/byte_array_decoder.go index 82ce9f84265c5..0c1c858fb48bb 100644 --- a/go/parquet/internal/encoding/byte_array_decoder.go +++ 
b/go/parquet/internal/encoding/byte_array_decoder.go @@ -49,7 +49,7 @@ func (PlainByteArrayDecoder) Type() parquet.Type { // // Returns the number of values that were decoded. func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { - max := utils.MinInt(len(out), pbad.nvals) + max := utils.Min(len(out), pbad.nvals) for i := 0; i < max; i++ { // there should always be at least four bytes which is the length of the diff --git a/go/parquet/internal/encoding/decoder.go b/go/parquet/internal/encoding/decoder.go index cee624730e993..acb57fbce7806 100644 --- a/go/parquet/internal/encoding/decoder.go +++ b/go/parquet/internal/encoding/decoder.go @@ -155,7 +155,7 @@ func (d *dictDecoder) decodeSpaced(out interface{}, nullCount int, validBits []b } func (d *dictDecoder) DecodeIndices(numValues int, bldr array.Builder) (int, error) { - n := shared_utils.MinInt(numValues, d.nvals) + n := shared_utils.Min(numValues, d.nvals) if cap(d.idxScratchSpace) < n { d.idxScratchSpace = make([]uint64, n, bitutil.NextPowerOf2(n)) } else { diff --git a/go/parquet/internal/encoding/delta_bit_packing.go b/go/parquet/internal/encoding/delta_bit_packing.go index a00f3457cac7a..560b77f4c66ce 100644 --- a/go/parquet/internal/encoding/delta_bit_packing.go +++ b/go/parquet/internal/encoding/delta_bit_packing.go @@ -158,7 +158,7 @@ func (d *DeltaBitPackInt32Decoder) unpackNextMini() error { // Decode retrieves min(remaining values, len(out)) values from the data and returns the number // of values actually decoded and any errors encountered. func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) { - max := shared_utils.MinInt(len(out), int(d.totalValues)) + max := shared_utils.Min(len(out), int(d.totalValues)) if max == 0 { return 0, nil } @@ -249,7 +249,7 @@ func (d *DeltaBitPackInt64Decoder) unpackNextMini() error { // Decode retrieves min(remaining values, len(out)) values from the data and returns the number // of values actually decoded and any errors encountered. func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) { - max := shared_utils.MinInt(len(out), d.nvals) + max := shared_utils.Min(len(out), d.nvals) if max == 0 { return 0, nil } diff --git a/go/parquet/internal/encoding/delta_byte_array.go b/go/parquet/internal/encoding/delta_byte_array.go index 57b0c8a70e5ad..5e5002e34a68f 100644 --- a/go/parquet/internal/encoding/delta_byte_array.go +++ b/go/parquet/internal/encoding/delta_byte_array.go @@ -172,7 +172,7 @@ func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error { // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { - max := utils.MinInt(len(out), d.nvals) + max := utils.Min(len(out), d.nvals) if max == 0 { return 0, nil } diff --git a/go/parquet/internal/encoding/delta_length_byte_array.go b/go/parquet/internal/encoding/delta_length_byte_array.go index d5a99c187d11e..183eb453ca0a3 100644 --- a/go/parquet/internal/encoding/delta_length_byte_array.go +++ b/go/parquet/internal/encoding/delta_length_byte_array.go @@ -126,7 +126,7 @@ func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) error { // Decode populates the passed in slice with data decoded until it hits the length of out // or runs out of values in the column to decode, then returns the number of values actually decoded. 
func (d *DeltaLengthByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { - max := utils.MinInt(len(out), d.nvals) + max := utils.Min(len(out), d.nvals) for i := 0; i < max; i++ { out[i] = d.data[:d.lengths[i]:d.lengths[i]] d.data = d.data[d.lengths[i]:] diff --git a/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go b/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go index 1e589fc2e7be1..2054e1bb85f21 100644 --- a/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go +++ b/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go @@ -38,7 +38,7 @@ func (PlainFixedLenByteArrayDecoder) Type() parquet.Type { // values to decode or the length of out has been filled. Then returns the total number of values // that were decoded. func (pflba *PlainFixedLenByteArrayDecoder) Decode(out []parquet.FixedLenByteArray) (int, error) { - max := utils.MinInt(len(out), pflba.nvals) + max := utils.Min(len(out), pflba.nvals) numBytesNeeded := max * pflba.typeLen if numBytesNeeded > len(pflba.data) || numBytesNeeded > math.MaxInt32 { return 0, xerrors.New("parquet: eof exception") diff --git a/go/parquet/internal/encoding/plain_encoder_types.gen.go b/go/parquet/internal/encoding/plain_encoder_types.gen.go index 09403d74cb06f..a41f754f62a88 100644 --- a/go/parquet/internal/encoding/plain_encoder_types.gen.go +++ b/go/parquet/internal/encoding/plain_encoder_types.gen.go @@ -172,7 +172,7 @@ func (PlainInt32Decoder) Type() parquet.Type { // decoding the min(len(out), remaining values). // It returns the number of values actually decoded and any error encountered. func (dec *PlainInt32Decoder) Decode(out []int32) (int, error) { - max := utils.MinInt(len(out), dec.nvals) + max := utils.Min(len(out), dec.nvals) nbytes := int64(max) * int64(arrow.Int32SizeBytes) if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { return 0, fmt.Errorf("parquet: eof exception decode plain Int32, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) @@ -277,7 +277,7 @@ func (PlainInt64Decoder) Type() parquet.Type { // decoding the min(len(out), remaining values). // It returns the number of values actually decoded and any error encountered. func (dec *PlainInt64Decoder) Decode(out []int64) (int, error) { - max := utils.MinInt(len(out), dec.nvals) + max := utils.Min(len(out), dec.nvals) nbytes := int64(max) * int64(arrow.Int64SizeBytes) if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { return 0, fmt.Errorf("parquet: eof exception decode plain Int64, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) @@ -382,7 +382,7 @@ func (PlainInt96Decoder) Type() parquet.Type { // decoding the min(len(out), remaining values). // It returns the number of values actually decoded and any error encountered. func (dec *PlainInt96Decoder) Decode(out []parquet.Int96) (int, error) { - max := utils.MinInt(len(out), dec.nvals) + max := utils.Min(len(out), dec.nvals) nbytes := int64(max) * int64(parquet.Int96SizeBytes) if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { return 0, fmt.Errorf("parquet: eof exception decode plain Int96, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) @@ -487,7 +487,7 @@ func (PlainFloat32Decoder) Type() parquet.Type { // decoding the min(len(out), remaining values). // It returns the number of values actually decoded and any error encountered. 
func (dec *PlainFloat32Decoder) Decode(out []float32) (int, error) { - max := utils.MinInt(len(out), dec.nvals) + max := utils.Min(len(out), dec.nvals) nbytes := int64(max) * int64(arrow.Float32SizeBytes) if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { return 0, fmt.Errorf("parquet: eof exception decode plain Float32, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) @@ -592,7 +592,7 @@ func (PlainFloat64Decoder) Type() parquet.Type { // decoding the min(len(out), remaining values). // It returns the number of values actually decoded and any error encountered. func (dec *PlainFloat64Decoder) Decode(out []float64) (int, error) { - max := utils.MinInt(len(out), dec.nvals) + max := utils.Min(len(out), dec.nvals) nbytes := int64(max) * int64(arrow.Float64SizeBytes) if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { return 0, fmt.Errorf("parquet: eof exception decode plain Float64, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) diff --git a/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl b/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl index 2838c63a41857..74f63e78bccf3 100644 --- a/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl +++ b/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl @@ -133,7 +133,7 @@ func (Plain{{.Name}}Decoder) Type() parquet.Type { // decoding the min(len(out), remaining values). // It returns the number of values actually decoded and any error encountered. func (dec *Plain{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { - max := utils.MinInt(len(out), dec.nvals) + max := utils.Min(len(out), dec.nvals) nbytes := int64(max) * int64({{.prefix}}.{{.Name}}SizeBytes) if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { return 0, fmt.Errorf("parquet: eof exception decode plain {{.Name}}, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go b/go/parquet/internal/encoding/typed_encoder.gen.go index 4bc18e8c63c01..04db72178f3ee 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go +++ b/go/parquet/internal/encoding/typed_encoder.gen.go @@ -195,7 +195,7 @@ func (DictInt32Decoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictInt32Decoder) Decode(out []int32) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -209,7 +209,7 @@ func (d *DictInt32Decoder) Decode(out []int32) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. func (d *DictInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err @@ -432,7 +432,7 @@ func (DictInt64Decoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. 
func (d *DictInt64Decoder) Decode(out []int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -446,7 +446,7 @@ func (d *DictInt64Decoder) Decode(out []int64) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. func (d *DictInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err @@ -647,7 +647,7 @@ func (DictInt96Decoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictInt96Decoder) Decode(out []parquet.Int96) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -661,7 +661,7 @@ func (d *DictInt96Decoder) Decode(out []parquet.Int96) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. func (d *DictInt96Decoder) DecodeSpaced(out []parquet.Int96, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err @@ -872,7 +872,7 @@ func (DictFloat32Decoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictFloat32Decoder) Decode(out []float32) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -886,7 +886,7 @@ func (d *DictFloat32Decoder) Decode(out []float32) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. func (d *DictFloat32Decoder) DecodeSpaced(out []float32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err @@ -1097,7 +1097,7 @@ func (DictFloat64Decoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictFloat64Decoder) Decode(out []float64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -1111,7 +1111,7 @@ func (d *DictFloat64Decoder) Decode(out []float64) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. 
func (d *DictFloat64Decoder) DecodeSpaced(out []float64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err @@ -1365,7 +1365,7 @@ func (DictByteArrayDecoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -1379,7 +1379,7 @@ func (d *DictByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. func (d *DictByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err @@ -1544,7 +1544,7 @@ func (DictFixedLenByteArrayDecoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictFixedLenByteArrayDecoder) Decode(out []parquet.FixedLenByteArray) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -1558,7 +1558,7 @@ func (d *DictFixedLenByteArrayDecoder) Decode(out []parquet.FixedLenByteArray) ( // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. func (d *DictFixedLenByteArrayDecoder) DecodeSpaced(out []parquet.FixedLenByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl index d72f31512047a..ceb755caa0b46 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl +++ b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl @@ -271,7 +271,7 @@ func (Dict{{.Name}}Decoder) Type() parquet.Type { // decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *Dict{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decode(out[:vals]) if err != nil { return decoded, err @@ -285,7 +285,7 @@ func (d *Dict{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { // Decode spaced is like Decode but will space out the data leaving slots for null values // based on the provided bitmap. 
func (d *Dict{{.Name}}Decoder) DecodeSpaced(out []{{.name}}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - vals := shared_utils.MinInt(len(out), d.nvals) + vals := shared_utils.Min(len(out), d.nvals) decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) if err != nil { return decoded, err diff --git a/go/parquet/internal/encoding/types.go b/go/parquet/internal/encoding/types.go index 4ab3ab1a1c954..f8d860c88a059 100644 --- a/go/parquet/internal/encoding/types.go +++ b/go/parquet/internal/encoding/types.go @@ -185,7 +185,7 @@ func (b *PooledBufferWriter) Reserve(nbytes int) { b.buf = bufferPool.Get().(*memory.Buffer) } - newCap := utils.MaxInt(b.buf.Cap()+b.offset, 256) + newCap := utils.Max(b.buf.Cap()+b.offset, 256) for newCap < b.pos+nbytes { newCap = bitutil.NextPowerOf2(newCap) } @@ -375,7 +375,7 @@ func (b *BufferWriter) Reserve(nbytes int) { if b.buffer == nil { b.buffer = memory.NewResizableBuffer(b.mem) } - newCap := utils.MaxInt(b.buffer.Cap()+b.offset, 256) + newCap := utils.Max(b.buffer.Cap()+b.offset, 256) for newCap < b.pos+nbytes+b.offset { newCap = bitutil.NextPowerOf2(newCap) } diff --git a/go/parquet/internal/testutils/pagebuilder.go b/go/parquet/internal/testutils/pagebuilder.go index 48ac331640087..525921d9631f9 100644 --- a/go/parquet/internal/testutils/pagebuilder.go +++ b/go/parquet/internal/testutils/pagebuilder.go @@ -75,7 +75,7 @@ func (d *DataPageBuilder) appendLevels(lvls []int16, maxLvl int16, e parquet.Enc func (d *DataPageBuilder) AppendDefLevels(lvls []int16, maxLvl int16) { d.defLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE) - d.nvals = utils.MaxInt(len(lvls), d.nvals) + d.nvals = utils.Max(len(lvls), d.nvals) d.defLvlEncoding = parquet.Encodings.RLE d.hasDefLvls = true } @@ -83,7 +83,7 @@ func (d *DataPageBuilder) AppendDefLevels(lvls []int16, maxLvl int16) { func (d *DataPageBuilder) AppendRepLevels(lvls []int16, maxLvl int16) { d.repLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE) - d.nvals = utils.MaxInt(len(lvls), d.nvals) + d.nvals = utils.Max(len(lvls), d.nvals) d.repLvlEncoding = parquet.Encodings.RLE d.hasRepLvls = true } @@ -122,7 +122,7 @@ func (d *DataPageBuilder) AppendValues(desc *schema.Column, values interface{}, panic(err) } - d.nvals = utils.MaxInt(sz, d.nvals) + d.nvals = utils.Max(sz, d.nvals) d.encoding = e d.hasValues = true } @@ -191,7 +191,7 @@ func MakeDataPage(dataPageVersion parquet.DataPageVersion, d *schema.Column, val num = builder.nvals } else { stream.Write(indexBuffer.Bytes()) - num = utils.MaxInt(builder.nvals, nvals) + num = utils.Max(builder.nvals, nvals) } buf := stream.Finish() diff --git a/go/parquet/internal/utils/bit_reader.go b/go/parquet/internal/utils/bit_reader.go index 0bf501e0488cf..d327be5f5253e 100644 --- a/go/parquet/internal/utils/bit_reader.go +++ b/go/parquet/internal/utils/bit_reader.go @@ -266,7 +266,7 @@ func (b *BitReader) GetBatchBools(out []bool) (int, error) { for i < length { // grab byte-aligned bits in a loop since it's more efficient than going // bit by bit when you can grab 8 bools at a time. 
- unpackSize := utils.MinInt(blen, length-i) / 8 * 8 + unpackSize := utils.Min(blen, length-i) / 8 * 8 n, err := b.reader.Read(buf[:bitutil.BytesForBits(int64(unpackSize))]) if err != nil { return i, err @@ -314,7 +314,7 @@ func (b *BitReader) GetBatch(bits uint, out []uint64) (int, error) { b.reader.Seek(b.byteoffset, io.SeekStart) for i < length { // unpack groups of 32 bytes at a time into a buffer since it's more efficient - unpackSize := utils.MinInt(buflen, length-i) + unpackSize := utils.Min(buflen, length-i) numUnpacked := unpack32(b.reader, b.unpackBuf[:unpackSize], int(bits)) if numUnpacked == 0 { break diff --git a/go/parquet/internal/utils/rle.go b/go/parquet/internal/utils/rle.go index f367e7dc13cee..dffe55402b95a 100644 --- a/go/parquet/internal/utils/rle.go +++ b/go/parquet/internal/utils/rle.go @@ -51,7 +51,7 @@ func MaxRLEBufferSize(width, numValues int) int { minRepeatedRunSize := 1 + int(bitutil.BytesForBits(int64(width))) repeatedMaxSize := int(bitutil.BytesForBits(int64(numValues))) * minRepeatedRunSize - return utils.MaxInt(literalMaxSize, repeatedMaxSize) + return utils.Max(literalMaxSize, repeatedMaxSize) } // Utility classes to do run length encoding (RLE) for fixed bit width values. If runs @@ -370,7 +370,7 @@ func (r *RleDecoder) consumeRepeatCounts(read, batchSize, remain int, run bituti } func (r *RleDecoder) consumeLiteralsUint64(dc DictionaryConverter, vals []uint64, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -388,7 +388,7 @@ func (r *RleDecoder) consumeLiteralsUint64(dc DictionaryConverter, vals []uint64 ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } diff --git a/go/parquet/internal/utils/typed_rle_dict.gen.go b/go/parquet/internal/utils/typed_rle_dict.gen.go index 886d24564db4b..37dc49a695806 100644 --- a/go/parquet/internal/utils/typed_rle_dict.gen.go +++ b/go/parquet/internal/utils/typed_rle_dict.gen.go @@ -130,7 +130,7 @@ func (r *RleDecoder) getspacedInt32(dc DictionaryConverter, vals []int32, batchS } func (r *RleDecoder) consumeLiteralsInt32(dc DictionaryConverter, vals []int32, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -148,7 +148,7 @@ func (r *RleDecoder) consumeLiteralsInt32(dc DictionaryConverter, vals []int32, ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -185,7 +185,7 @@ func (r *RleDecoder) GetBatchWithDictInt32(dc DictionaryConverter, vals []int32) if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -193,7 +193,7 @@ func (r *RleDecoder) GetBatchWithDictInt32(dc 
DictionaryConverter, vals []int32) read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { @@ -323,7 +323,7 @@ func (r *RleDecoder) getspacedInt64(dc DictionaryConverter, vals []int64, batchS } func (r *RleDecoder) consumeLiteralsInt64(dc DictionaryConverter, vals []int64, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -341,7 +341,7 @@ func (r *RleDecoder) consumeLiteralsInt64(dc DictionaryConverter, vals []int64, ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -378,7 +378,7 @@ func (r *RleDecoder) GetBatchWithDictInt64(dc DictionaryConverter, vals []int64) if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -386,7 +386,7 @@ func (r *RleDecoder) GetBatchWithDictInt64(dc DictionaryConverter, vals []int64) read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { @@ -516,7 +516,7 @@ func (r *RleDecoder) getspacedInt96(dc DictionaryConverter, vals []parquet.Int96 } func (r *RleDecoder) consumeLiteralsInt96(dc DictionaryConverter, vals []parquet.Int96, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -534,7 +534,7 @@ func (r *RleDecoder) consumeLiteralsInt96(dc DictionaryConverter, vals []parquet ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -571,7 +571,7 @@ func (r *RleDecoder) GetBatchWithDictInt96(dc DictionaryConverter, vals []parque if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -579,7 +579,7 @@ func (r *RleDecoder) GetBatchWithDictInt96(dc DictionaryConverter, vals []parque read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { @@ -709,7 +709,7 @@ func (r *RleDecoder) getspacedFloat32(dc DictionaryConverter, vals []float32, ba } func (r *RleDecoder) 
consumeLiteralsFloat32(dc DictionaryConverter, vals []float32, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -727,7 +727,7 @@ func (r *RleDecoder) consumeLiteralsFloat32(dc DictionaryConverter, vals []float ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -764,7 +764,7 @@ func (r *RleDecoder) GetBatchWithDictFloat32(dc DictionaryConverter, vals []floa if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -772,7 +772,7 @@ func (r *RleDecoder) GetBatchWithDictFloat32(dc DictionaryConverter, vals []floa read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { @@ -902,7 +902,7 @@ func (r *RleDecoder) getspacedFloat64(dc DictionaryConverter, vals []float64, ba } func (r *RleDecoder) consumeLiteralsFloat64(dc DictionaryConverter, vals []float64, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -920,7 +920,7 @@ func (r *RleDecoder) consumeLiteralsFloat64(dc DictionaryConverter, vals []float ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -957,7 +957,7 @@ func (r *RleDecoder) GetBatchWithDictFloat64(dc DictionaryConverter, vals []floa if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -965,7 +965,7 @@ func (r *RleDecoder) GetBatchWithDictFloat64(dc DictionaryConverter, vals []floa read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { @@ -1095,7 +1095,7 @@ func (r *RleDecoder) getspacedByteArray(dc DictionaryConverter, vals []parquet.B } func (r *RleDecoder) consumeLiteralsByteArray(dc DictionaryConverter, vals []parquet.ByteArray, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -1113,7 +1113,7 @@ func (r *RleDecoder) consumeLiteralsByteArray(dc 
DictionaryConverter, vals []par ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -1150,7 +1150,7 @@ func (r *RleDecoder) GetBatchWithDictByteArray(dc DictionaryConverter, vals []pa if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -1158,7 +1158,7 @@ func (r *RleDecoder) GetBatchWithDictByteArray(dc DictionaryConverter, vals []pa read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { @@ -1288,7 +1288,7 @@ func (r *RleDecoder) getspacedFixedLenByteArray(dc DictionaryConverter, vals []p } func (r *RleDecoder) consumeLiteralsFixedLenByteArray(dc DictionaryConverter, vals []parquet.FixedLenByteArray, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -1306,7 +1306,7 @@ func (r *RleDecoder) consumeLiteralsFixedLenByteArray(dc DictionaryConverter, va ) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -1343,7 +1343,7 @@ func (r *RleDecoder) GetBatchWithDictFixedLenByteArray(dc DictionaryConverter, v if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -1351,7 +1351,7 @@ func (r *RleDecoder) GetBatchWithDictFixedLenByteArray(dc DictionaryConverter, v read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { diff --git a/go/parquet/internal/utils/typed_rle_dict.gen.go.tmpl b/go/parquet/internal/utils/typed_rle_dict.gen.go.tmpl index abcb419055a92..88c7dd979ebf1 100644 --- a/go/parquet/internal/utils/typed_rle_dict.gen.go.tmpl +++ b/go/parquet/internal/utils/typed_rle_dict.gen.go.tmpl @@ -129,7 +129,7 @@ func (r *RleDecoder) getspaced{{.Name}}(dc DictionaryConverter, vals []{{.name}} } func (r *RleDecoder) consumeLiterals{{.Name}}(dc DictionaryConverter, vals []{{.name}}, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) { - batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf)) + batch := utils.Min(utils.Min(remain, int(r.litCount)), len(buf)) buf = buf[:batch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) @@ -147,7 +147,7 @@ func (r *RleDecoder) consumeLiterals{{.Name}}(dc DictionaryConverter, vals []{{. 
) for read < batch { if run.Set { - updateSize := utils.MinInt(batch-read, int(run.Len)) + updateSize := utils.Min(batch-read, int(run.Len)) if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil { return 0, 0, run, err } @@ -184,7 +184,7 @@ func (r *RleDecoder) GetBatchWithDict{{.Name}}(dc DictionaryConverter, vals []{{ if !dc.IsValid(idx) { return read, nil } - batch := utils.MinInt(remain, int(r.repCount)) + batch := utils.Min(remain, int(r.repCount)) if err := dc.Fill(vals[:batch], idx); err != nil { return read, err } @@ -192,7 +192,7 @@ func (r *RleDecoder) GetBatchWithDict{{.Name}}(dc DictionaryConverter, vals []{{ read += batch vals = vals[batch:] case r.litCount > 0: - litbatch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), 1024) + litbatch := utils.Min(utils.Min(remain, int(r.litCount)), 1024) buf := indexbuffer[:litbatch] n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf) if n != litbatch { diff --git a/go/parquet/pqarrow/column_readers.go b/go/parquet/pqarrow/column_readers.go index 3c38aba5c32a6..a403b2196a80c 100644 --- a/go/parquet/pqarrow/column_readers.go +++ b/go/parquet/pqarrow/column_readers.go @@ -790,7 +790,7 @@ func bigEndianToDecimal128(buf []byte) (decimal128.Num, error) { isNeg := int8(buf[0]) < 0 // 1. extract high bits - highBitsOffset := utils.MaxInt(0, len(buf)-8) + highBitsOffset := utils.Max(0, len(buf)-8) var ( highBits uint64 lowBits uint64 @@ -811,7 +811,7 @@ func bigEndianToDecimal128(buf []byte) (decimal128.Num, error) { } // 2. extract lower bits - lowBitsOffset := utils.MinInt(len(buf), 8) + lowBitsOffset := utils.Min(len(buf), 8) lowBits = uint64FromBigEndianShifted(buf[highBitsOffset:]) if lowBitsOffset == 8 { @@ -850,7 +850,7 @@ func bigEndianToDecimal256(buf []byte) (decimal256.Num, error) { } for wordIdx := 0; wordIdx < 4; wordIdx++ { - wordLen := utils.MinInt(len(buf), arrow.Uint64SizeBytes) + wordLen := utils.Min(len(buf), arrow.Uint64SizeBytes) word := buf[len(buf)-wordLen:] if wordLen == 8 { From 6c326db6a5686a78bc77be662b61236ddbfc66dc Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 19 Dec 2023 19:58:29 +0100 Subject: [PATCH 073/570] GH-33984: [C++][Python] DLPack implementation for Arrow Arrays (producer) (#38472) ### Rationale for this change DLPack is selected for Array API protocol so it is important to have it implemented for Arrow/PyArrow Arrays also. This is possible for primitive type arrays (int, uint and float) with no validity buffer. Device support is not in scope of this PR (CPU only). ### What changes are included in this PR? - `ExportArray` and `ExportDevice` methods on Arrow C++ Arrays - `__dlpack__` method on the base PyArrow Array class exposing `ExportArray` method - `__dlpack_device__` method on the base PyArrow Array class exposing `ExportDevice` method ### Are these changes tested? Yes, tests are added to `dlpack_test.cc` and `test_array.py`. ### Are there any user-facing changes? No. 
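### Example usage (illustrative)

As a rough sketch only — NumPy is assumed here as the DLPack consumer (`np.from_dlpack` requires NumPy >= 1.22 and is not part of this change) — the new protocol methods can be exercised along these lines:

```python
import numpy as np
import pyarrow as pa

# Only primitive (integer/floating-point) arrays with no validity buffer are exportable.
arr = pa.array([1, 2, 3], type=pa.int64())

# __dlpack_device__ reports (device_type, device_id); CPU is (1, 0), i.e. kDLCPU.
print(arr.__dlpack_device__())

# Any DLPack-aware consumer can then import the data zero-copy via __dlpack__,
# for example NumPy:
np_arr = np.from_dlpack(arr)
print(np_arr)  # -> [1 2 3]
```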
* Closes: #33984 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Antoine Pitrou Co-authored-by: Joris Van den Bossche Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/c/CMakeLists.txt | 1 + cpp/src/arrow/c/dlpack.cc | 133 ++++++++ cpp/src/arrow/c/dlpack.h | 51 ++++ cpp/src/arrow/c/dlpack_abi.h | 321 ++++++++++++++++++++ cpp/src/arrow/c/dlpack_test.cc | 129 ++++++++ dev/release/rat_exclude_files.txt | 1 + docs/source/python/dlpack.rst | 93 ++++++ docs/source/python/index.rst | 1 + docs/source/python/interchange_protocol.rst | 6 +- python/pyarrow/_dlpack.pxi | 46 +++ python/pyarrow/array.pxi | 38 +++ python/pyarrow/includes/libarrow.pxd | 19 ++ python/pyarrow/lib.pyx | 3 + python/pyarrow/tests/test_dlpack.py | 142 +++++++++ 15 files changed, 982 insertions(+), 3 deletions(-) create mode 100644 cpp/src/arrow/c/dlpack.cc create mode 100644 cpp/src/arrow/c/dlpack.h create mode 100644 cpp/src/arrow/c/dlpack_abi.h create mode 100644 cpp/src/arrow/c/dlpack_test.cc create mode 100644 docs/source/python/dlpack.rst create mode 100644 python/pyarrow/_dlpack.pxi create mode 100644 python/pyarrow/tests/test_dlpack.py diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 46a7aa910633d..00947c6275678 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -192,6 +192,7 @@ set(ARROW_SRCS type_traits.cc visitor.cc c/bridge.cc + c/dlpack.cc io/buffered.cc io/caching.cc io/compressed.cc diff --git a/cpp/src/arrow/c/CMakeLists.txt b/cpp/src/arrow/c/CMakeLists.txt index 3765477ba09cd..81a81cd3f1103 100644 --- a/cpp/src/arrow/c/CMakeLists.txt +++ b/cpp/src/arrow/c/CMakeLists.txt @@ -16,6 +16,7 @@ # under the License. add_arrow_test(bridge_test PREFIX "arrow-c") +add_arrow_test(dlpack_test) add_arrow_benchmark(bridge_benchmark) diff --git a/cpp/src/arrow/c/dlpack.cc b/cpp/src/arrow/c/dlpack.cc new file mode 100644 index 0000000000000..13ee2761b0c11 --- /dev/null +++ b/cpp/src/arrow/c/dlpack.cc @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/c/dlpack.h" + +#include "arrow/array/array_base.h" +#include "arrow/c/dlpack_abi.h" +#include "arrow/device.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" + +namespace arrow::dlpack { + +namespace { + +Result GetDLDataType(const DataType& type) { + DLDataType dtype; + dtype.lanes = 1; + dtype.bits = type.bit_width(); + switch (type.id()) { + case Type::INT8: + case Type::INT16: + case Type::INT32: + case Type::INT64: + dtype.code = DLDataTypeCode::kDLInt; + return dtype; + case Type::UINT8: + case Type::UINT16: + case Type::UINT32: + case Type::UINT64: + dtype.code = DLDataTypeCode::kDLUInt; + return dtype; + case Type::HALF_FLOAT: + case Type::FLOAT: + case Type::DOUBLE: + dtype.code = DLDataTypeCode::kDLFloat; + return dtype; + case Type::BOOL: + // DLPack supports byte-packed boolean values + return Status::TypeError("Bit-packed boolean data type not supported by DLPack."); + default: + return Status::TypeError("DataType is not compatible with DLPack spec: ", + type.ToString()); + } +} + +struct ManagerCtx { + std::shared_ptr array; + DLManagedTensor tensor; +}; + +} // namespace + +Result ExportArray(const std::shared_ptr& arr) { + // Define DLDevice struct nad check if array type is supported + // by the DLPack protocol at the same time. Raise TypeError if not. + // Supported data types: int, uint, float with no validity buffer. + ARROW_ASSIGN_OR_RAISE(auto device, ExportDevice(arr)) + + // Define the DLDataType struct + const DataType& type = *arr->type(); + std::shared_ptr data = arr->data(); + ARROW_ASSIGN_OR_RAISE(auto dlpack_type, GetDLDataType(type)); + + // Create ManagerCtx that will serve as the owner of the DLManagedTensor + std::unique_ptr ctx(new ManagerCtx); + + // Define the data pointer to the DLTensor + // If array is of length 0, data pointer should be NULL + if (arr->length() == 0) { + ctx->tensor.dl_tensor.data = NULL; + } else { + const auto data_offset = data->offset * type.byte_width(); + ctx->tensor.dl_tensor.data = + const_cast(data->buffers[1]->data() + data_offset); + } + + ctx->tensor.dl_tensor.device = device; + ctx->tensor.dl_tensor.ndim = 1; + ctx->tensor.dl_tensor.dtype = dlpack_type; + ctx->tensor.dl_tensor.shape = const_cast(&data->length); + ctx->tensor.dl_tensor.strides = NULL; + ctx->tensor.dl_tensor.byte_offset = 0; + + ctx->array = std::move(data); + ctx->tensor.manager_ctx = ctx.get(); + ctx->tensor.deleter = [](struct DLManagedTensor* self) { + delete reinterpret_cast(self->manager_ctx); + }; + return &ctx.release()->tensor; +} + +Result ExportDevice(const std::shared_ptr& arr) { + // Check if array is supported by the DLPack protocol. 
+ if (arr->null_count() > 0) { + return Status::TypeError("Can only use DLPack on arrays with no nulls."); + } + const DataType& type = *arr->type(); + if (type.id() == Type::BOOL) { + return Status::TypeError("Bit-packed boolean data type not supported by DLPack."); + } + if (!is_integer(type.id()) && !is_floating(type.id())) { + return Status::TypeError("DataType is not compatible with DLPack spec: ", + type.ToString()); + } + + // Define DLDevice struct + DLDevice device; + if (arr->data()->buffers[1]->device_type() == DeviceAllocationType::kCPU) { + device.device_id = 0; + device.device_type = DLDeviceType::kDLCPU; + return device; + } else { + return Status::NotImplemented( + "DLPack support is implemented only for buffers on CPU device."); + } +} + +} // namespace arrow::dlpack diff --git a/cpp/src/arrow/c/dlpack.h b/cpp/src/arrow/c/dlpack.h new file mode 100644 index 0000000000000..d11ccfc1fd722 --- /dev/null +++ b/cpp/src/arrow/c/dlpack.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/array/array_base.h" +#include "arrow/c/dlpack_abi.h" + +namespace arrow::dlpack { + +/// \brief Export Arrow array as DLPack tensor. +/// +/// DLMangedTensor is produced as defined by the DLPack protocol, +/// see https://dmlc.github.io/dlpack/latest/. +/// +/// Data types for which the protocol is supported are +/// integer and floating-point data types. +/// +/// DLPack protocol only supports arrays with one contiguous +/// memory region which means Arrow Arrays with validity buffers +/// are not supported. +/// +/// \param[in] arr Arrow array +/// \return DLManagedTensor struct +ARROW_EXPORT +Result ExportArray(const std::shared_ptr& arr); + +/// \brief Get DLDevice with enumerator specifying the +/// type of the device data is stored on and index of the +/// device which is 0 by default for CPU. +/// +/// \param[in] arr Arrow array +/// \return DLDevice struct +ARROW_EXPORT +Result ExportDevice(const std::shared_ptr& arr); + +} // namespace arrow::dlpack diff --git a/cpp/src/arrow/c/dlpack_abi.h b/cpp/src/arrow/c/dlpack_abi.h new file mode 100644 index 0000000000000..4af557a7ed5d7 --- /dev/null +++ b/cpp/src/arrow/c/dlpack_abi.h @@ -0,0 +1,321 @@ +// Taken from: +// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! 
\brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! 
+ * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, + * the underlying storage size of bool is 8 bits) + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ +typedef struct DLManagedTensor { + /*! 
\brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void* manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor* self); +} DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void* manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the + * DLManagedTensorVersioned. It can be NULL if there is no way for the caller to provide + * a reasonable destructor. The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned* self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/cpp/src/arrow/c/dlpack_test.cc b/cpp/src/arrow/c/dlpack_test.cc new file mode 100644 index 0000000000000..3136506bf39ad --- /dev/null +++ b/cpp/src/arrow/c/dlpack_test.cc @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
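Editor's note: the DLManagedTensor layout and deleter contract defined in this vendored header are what a consumer of the new arrow::dlpack::ExportArray API is expected to honor. As a minimal illustration (not part of the patch; ConsumeInt64Tensor is a hypothetical helper and assumes a CPU-resident int64 array), a borrower could read the exported values and then return ownership by calling the deleter:

#include <cstdint>
#include <iostream>
#include <memory>

#include "arrow/array.h"
#include "arrow/c/dlpack.h"
#include "arrow/c/dlpack_abi.h"
#include "arrow/result.h"
#include "arrow/status.h"

// Hypothetical consumer: borrow an int64 Arrow array through DLPack, read the
// values, then hand the memory back by calling the producer's deleter.
arrow::Status ConsumeInt64Tensor(const std::shared_ptr<arrow::Array>& array) {
  ARROW_ASSIGN_OR_RAISE(DLManagedTensor* managed, arrow::dlpack::ExportArray(array));
  const DLTensor& tensor = managed->dl_tensor;
  if (tensor.ndim == 1 && tensor.dtype.code == kDLInt && tensor.dtype.bits == 64) {
    // ExportArray sets byte_offset to 0, so data points directly at the values.
    const auto* values = static_cast<const int64_t*>(tensor.data);
    for (int64_t i = 0; i < tensor.shape[0]; ++i) {
      std::cout << values[i] << " ";
    }
    std::cout << std::endl;
  }
  // Per the protocol, the borrower must call the deleter when it is done
  // (a deleter may be NULL if the producer has nothing to release).
  if (managed->deleter != nullptr) {
    managed->deleter(managed);
  }
  return arrow::Status::OK();
}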
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/array_base.h"
+#include "arrow/c/dlpack.h"
+#include "arrow/c/dlpack_abi.h"
+#include "arrow/memory_pool.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow::dlpack {
+
+class TestExportArray : public ::testing::Test {
+ public:
+  void SetUp() {}
+};
+
+void CheckDLTensor(const std::shared_ptr<Array>& arr,
+                   const std::shared_ptr<DataType>& arrow_type,
+                   DLDataTypeCode dlpack_type, int64_t length) {
+  ASSERT_OK_AND_ASSIGN(auto dlmtensor, arrow::dlpack::ExportArray(arr));
+  auto dltensor = dlmtensor->dl_tensor;
+
+  const auto byte_width = arr->type()->byte_width();
+  const auto start = arr->offset() * byte_width;
+  ASSERT_OK_AND_ASSIGN(auto sliced_buffer,
+                       SliceBufferSafe(arr->data()->buffers[1], start));
+  ASSERT_EQ(sliced_buffer->data(), dltensor.data);
+
+  ASSERT_EQ(0, dltensor.byte_offset);
+  ASSERT_EQ(NULL, dltensor.strides);
+  ASSERT_EQ(length, dltensor.shape[0]);
+  ASSERT_EQ(1, dltensor.ndim);
+
+  ASSERT_EQ(dlpack_type, dltensor.dtype.code);
+
+  ASSERT_EQ(arrow_type->bit_width(), dltensor.dtype.bits);
+  ASSERT_EQ(1, dltensor.dtype.lanes);
+  ASSERT_EQ(DLDeviceType::kDLCPU, dltensor.device.device_type);
+  ASSERT_EQ(0, dltensor.device.device_id);
+
+  ASSERT_OK_AND_ASSIGN(auto device, arrow::dlpack::ExportDevice(arr));
+  ASSERT_EQ(DLDeviceType::kDLCPU, device.device_type);
+  ASSERT_EQ(0, device.device_id);
+
+  dlmtensor->deleter(dlmtensor);
+}
+
+TEST_F(TestExportArray, TestSupportedArray) {
+  const std::vector<std::pair<std::shared_ptr<DataType>, DLDataTypeCode>> cases = {
+      {int8(), DLDataTypeCode::kDLInt},
+      {uint8(), DLDataTypeCode::kDLUInt},
+      {
+          int16(),
+          DLDataTypeCode::kDLInt,
+      },
+      {uint16(), DLDataTypeCode::kDLUInt},
+      {
+          int32(),
+          DLDataTypeCode::kDLInt,
+      },
+      {uint32(), DLDataTypeCode::kDLUInt},
+      {
+          int64(),
+          DLDataTypeCode::kDLInt,
+      },
+      {uint64(), DLDataTypeCode::kDLUInt},
+      {float16(), DLDataTypeCode::kDLFloat},
+      {float32(), DLDataTypeCode::kDLFloat},
+      {float64(), DLDataTypeCode::kDLFloat}};
+
+  const auto allocated_bytes = arrow::default_memory_pool()->bytes_allocated();
+
+  for (auto [arrow_type, dlpack_type] : cases) {
+    const std::shared_ptr<Array> array =
+        ArrayFromJSON(arrow_type, "[1, 0, 10, 0, 2, 1, 3, 5, 1, 0]");
+    CheckDLTensor(array, arrow_type, dlpack_type, 10);
+    ASSERT_OK_AND_ASSIGN(auto sliced_1, array->SliceSafe(1, 5));
+    CheckDLTensor(sliced_1, arrow_type, dlpack_type, 5);
+    ASSERT_OK_AND_ASSIGN(auto sliced_2, array->SliceSafe(0, 5));
+    CheckDLTensor(sliced_2, arrow_type, dlpack_type, 5);
+    ASSERT_OK_AND_ASSIGN(auto sliced_3, array->SliceSafe(3));
+    CheckDLTensor(sliced_3, arrow_type, dlpack_type, 7);
+  }
+
+  ASSERT_EQ(allocated_bytes, arrow::default_memory_pool()->bytes_allocated());
+}
+
+TEST_F(TestExportArray, TestErrors) {
+  const std::shared_ptr<Array> array_null = ArrayFromJSON(null(), "[]");
+  ASSERT_RAISES_WITH_MESSAGE(TypeError,
+                             "Type error: DataType is not compatible with DLPack spec: " +
+                                 array_null->type()->ToString(),
+                             arrow::dlpack::ExportArray(array_null));
+
+  const std::shared_ptr<Array> array_with_null = ArrayFromJSON(int8(), "[1, 100, null]");
+  ASSERT_RAISES_WITH_MESSAGE(TypeError,
+                             "Type error: Can only use DLPack on arrays with no nulls.",
+                             arrow::dlpack::ExportArray(array_with_null));
+
+  const std::shared_ptr<Array> array_string =
+      ArrayFromJSON(utf8(), R"(["itsy", "bitsy", "spider"])");
+  ASSERT_RAISES_WITH_MESSAGE(TypeError,
+                             "Type error: DataType is not compatible with DLPack spec: " +
+                                 array_string->type()->ToString(),
+                             arrow::dlpack::ExportArray(array_string));
+
+  const std::shared_ptr<Array> array_boolean = ArrayFromJSON(boolean(), "[true,
false]"); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, "Type error: Bit-packed boolean data type not supported by DLPack.", + arrow::dlpack::ExportDevice(array_boolean)); +} + +} // namespace arrow::dlpack diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index ce637bf839232..4f86a12afe4fb 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -12,6 +12,7 @@ ci/etc/*.patch ci/vcpkg/*.patch CHANGELOG.md cpp/CHANGELOG_PARQUET.md +cpp/src/arrow/c/dlpack_abi.h cpp/src/arrow/io/mman.h cpp/src/arrow/util/random.h cpp/src/arrow/status.cc diff --git a/docs/source/python/dlpack.rst b/docs/source/python/dlpack.rst new file mode 100644 index 0000000000000..f612ebabde5c9 --- /dev/null +++ b/docs/source/python/dlpack.rst @@ -0,0 +1,93 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _pyarrow-dlpack: + +The DLPack Protocol +=================== + +`The DLPack Protocol `_ +is a stable in-memory data structure that allows exchange +between major frameworks working with multidimensional +arrays or tensors. It is designed for cross hardware +support meaning it allows exchange of data on devices other +than the CPU (e.g. GPU). + +DLPack protocol had been +`selected as the Python array API standard `_ +by the +`Consortium for Python Data API Standards `_ +in order to enable device aware data interchange between array/tensor +libraries in the Python ecosystem. See more about the standard +in the +`protocol documentation `_ +and more about DLPack in the +`Python Specification for DLPack `_. + +Implementation of DLPack in PyArrow +----------------------------------- + +The producing side of the DLPack Protocol is implemented for ``pa.Array`` +and can be used to interchange data between PyArrow and other tensor +libraries. Supported data types are integer, unsigned integer and float. The +protocol has no missing data support meaning PyArrow arrays with +missing values cannot be transferred through the DLPack +protocol. Currently, the Arrow implementation of the protocol only supports +data on a CPU device. + +Data interchange syntax of the protocol includes + +1. ``from_dlpack(x)``: consuming an array object that implements a + ``__dlpack__`` method and creating a new array while sharing the + memory. + +2. ``__dlpack__(self, stream=None)`` and ``__dlpack_device__``: + producing a PyCapsule with the DLPack struct which is called from + within ``from_dlpack(x)``. + +PyArrow implements the second part of the protocol +(``__dlpack__(self, stream=None)`` and ``__dlpack_device__``) and can +thus be consumed by libraries implementing ``from_dlpack``. + +Example +------- + +Convert a PyArrow CPU array to NumPy array: + +.. 
code-block:: + + >>> import pyarrow as pa + >>> array = pa.array([2, 0, 2, 4]) + + [ + 2, + 0, + 2, + 4 + ] + + >>> import numpy as np + >>> np.from_dlpack(array) + array([2, 0, 2, 4]) + +Convert a PyArrow CPU array to PyTorch tensor: + +.. code-block:: + + >>> import torch + >>> torch.from_dlpack(array) + tensor([2, 0, 2, 4]) diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 6a3de3d42b149..08939bc760df6 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -53,6 +53,7 @@ files into Arrow structures. numpy pandas interchange_protocol + dlpack timestamps orc csv diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index c354541a6779c..2a5ec8afede7b 100644 --- a/docs/source/python/interchange_protocol.rst +++ b/docs/source/python/interchange_protocol.rst @@ -37,7 +37,7 @@ libraries in the Python ecosystem. See more about the standard in the `protocol documentation `_. -From pyarrow to other libraries: ``__dataframe__()`` method +From PyArrow to other libraries: ``__dataframe__()`` method ----------------------------------------------------------- The ``__dataframe__()`` method creates a new exchange object that @@ -54,7 +54,7 @@ This is meant to be used by the consumer library when calling the ``from_dataframe()`` function and is not meant to be used manually by the user. -From other libraries to pyarrow: ``from_dataframe()`` +From other libraries to PyArrow: ``from_dataframe()`` ----------------------------------------------------- With the ``from_dataframe()`` function, we can construct a :class:`pyarrow.Table` @@ -63,7 +63,7 @@ from any dataframe object that implements the protocol. We can for example take a pandas dataframe and construct a -pyarrow table with the use of the interchange protocol: +PyArrow table with the use of the interchange protocol: .. code-block:: diff --git a/python/pyarrow/_dlpack.pxi b/python/pyarrow/_dlpack.pxi new file mode 100644 index 0000000000000..c2f4cff640691 --- /dev/null +++ b/python/pyarrow/_dlpack.pxi @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
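Editor's note: the restrictions called out in the documentation above (no validity buffers, primitive integer and floating-point types only) surface as TypeError results from the C++ export function. A small sketch of exercising those error paths (CheckUnsupportedInputs is hypothetical; ArrayFromJSON is the testing helper already used by dlpack_test.cc in this patch):

#include <cassert>

#include "arrow/array.h"
#include "arrow/c/dlpack.h"
#include "arrow/testing/gtest_util.h"  // provides arrow::ArrayFromJSON

// Hypothetical check: both exports should fail with TypeError, matching the
// "no missing data" and "integer/float only" rules described in the docs.
void CheckUnsupportedInputs() {
  auto with_nulls = arrow::ArrayFromJSON(arrow::int8(), "[1, null, 3]");
  auto strings = arrow::ArrayFromJSON(arrow::utf8(), R"(["a", "b"])");
  assert(arrow::dlpack::ExportArray(with_nulls).status().IsTypeError());
  assert(arrow::dlpack::ExportArray(strings).status().IsTypeError());
}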
+ +cimport cpython +from cpython.pycapsule cimport PyCapsule_New + + +cdef void dlpack_pycapsule_deleter(object dltensor) noexcept: + cdef DLManagedTensor* dlm_tensor + cdef PyObject* err_type + cdef PyObject* err_value + cdef PyObject* err_traceback + + # Do nothing if the capsule has been consumed + if cpython.PyCapsule_IsValid(dltensor, "used_dltensor"): + return + + # An exception may be in-flight, we must save it in case + # we create another one + cpython.PyErr_Fetch(&err_type, &err_value, &err_traceback) + + dlm_tensor = cpython.PyCapsule_GetPointer(dltensor, 'dltensor') + if dlm_tensor == NULL: + cpython.PyErr_WriteUnraisable(dltensor) + # The deleter can be NULL if there is no way for the caller + # to provide a reasonable destructor + elif dlm_tensor.deleter: + dlm_tensor.deleter(dlm_tensor) + assert (not cpython.PyErr_Occurred()) + + # Set the error indicator from err_type, err_value, err_traceback + cpython.PyErr_Restore(err_type, err_value, err_traceback) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 789e30d3e9b00..74a196002bfa6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1779,6 +1779,44 @@ cdef class Array(_PandasConvertible): return pyarrow_wrap_array(array) + def __dlpack__(self, stream=None): + """Export a primitive array as a DLPack capsule. + + Parameters + ---------- + stream : int, optional + A Python integer representing a pointer to a stream. Currently not supported. + Stream is provided by the consumer to the producer to instruct the producer + to ensure that operations can safely be performed on the array. + + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + if stream is None: + dlm_tensor = GetResultValue(ExportToDLPack(self.sp_array)) + + return PyCapsule_New(dlm_tensor, 'dltensor', dlpack_pycapsule_deleter) + else: + raise NotImplementedError( + "Only stream=None is supported." + ) + + def __dlpack_device__(self): + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. 
+ """ + device = GetResultValue(ExportDevice(self.sp_array)) + return device.device_type, device.device_id + cdef _array_like_to_pandas(obj, options, types_mapper): cdef: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 403846a38f3fd..bad5ec606c268 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1199,6 +1199,25 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CScalar] MakeNullScalar(shared_ptr[CDataType] type) +cdef extern from "arrow/c/dlpack_abi.h" nogil: + ctypedef enum DLDeviceType: + kDLCPU = 1 + + ctypedef struct DLDevice: + DLDeviceType device_type + int32_t device_id + + ctypedef struct DLManagedTensor: + void (*deleter)(DLManagedTensor*) + + +cdef extern from "arrow/c/dlpack.h" namespace "arrow::dlpack" nogil: + CResult[DLManagedTensor*] ExportToDLPack" arrow::dlpack::ExportArray"( + const shared_ptr[CArray]& arr) + + CResult[DLDevice] ExportDevice(const shared_ptr[CArray]& arr) + + cdef extern from "arrow/builder.h" namespace "arrow" nogil: cdef cppclass CArrayBuilder" arrow::ArrayBuilder": diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 57fb0f42e38bf..29a0bed55949c 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -176,6 +176,9 @@ include "table.pxi" # Tensors include "tensor.pxi" +# DLPack +include "_dlpack.pxi" + # File IO include "io.pxi" diff --git a/python/pyarrow/tests/test_dlpack.py b/python/pyarrow/tests/test_dlpack.py new file mode 100644 index 0000000000000..7cf3f4acdbd40 --- /dev/null +++ b/python/pyarrow/tests/test_dlpack.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
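Editor's note: the (device_type, device_id) tuple documented for __dlpack_device__ above corresponds one-to-one with the DLDevice returned by the C++ ExportDevice call declared in libarrow.pxd. A rough C++ equivalent (GetDLPackDevice is a hypothetical helper, not part of the patch):

#include <memory>
#include <utility>

#include "arrow/array.h"
#include "arrow/c/dlpack.h"
#include "arrow/c/dlpack_abi.h"
#include "arrow/result.h"

// Hypothetical helper returning the same pair that Array.__dlpack_device__
// exposes in Python; CPU-backed arrays yield (kDLCPU = 1, 0).
arrow::Result<std::pair<int, int>> GetDLPackDevice(
    const std::shared_ptr<arrow::Array>& array) {
  ARROW_ASSIGN_OR_RAISE(DLDevice device, arrow::dlpack::ExportDevice(array));
  return std::pair<int, int>{static_cast<int>(device.device_type),
                             static_cast<int>(device.device_id)};
}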
+ +import ctypes +from functools import wraps +import pytest + +import numpy as np + +import pyarrow as pa +from pyarrow.vendored.version import Version + + +def PyCapsule_IsValid(capsule, name): + return ctypes.pythonapi.PyCapsule_IsValid(ctypes.py_object(capsule), name) == 1 + + +def check_dlpack_export(arr, expected_arr): + DLTensor = arr.__dlpack__() + assert PyCapsule_IsValid(DLTensor, b"dltensor") is True + + result = np.from_dlpack(arr) + np.testing.assert_array_equal(result, expected_arr, strict=True) + + assert arr.__dlpack_device__() == (1, 0) + + +def check_bytes_allocated(f): + @wraps(f) + def wrapper(*args, **kwargs): + allocated_bytes = pa.total_allocated_bytes() + try: + return f(*args, **kwargs) + finally: + assert pa.total_allocated_bytes() == allocated_bytes + return wrapper + + +@check_bytes_allocated +@pytest.mark.parametrize( + ('value_type', 'np_type'), + [ + (pa.uint8(), np.uint8), + (pa.uint16(), np.uint16), + (pa.uint32(), np.uint32), + (pa.uint64(), np.uint64), + (pa.int8(), np.int8), + (pa.int16(), np.int16), + (pa.int32(), np.int32), + (pa.int64(), np.int64), + (pa.float16(), np.float16), + (pa.float32(), np.float32), + (pa.float64(), np.float64), + ] +) +def test_dlpack(value_type, np_type): + if Version(np.__version__) < Version("1.24.0"): + pytest.skip("No dlpack support in numpy versions older than 1.22.0, " + "strict keyword in assert_array_equal added in numpy version " + "1.24.0") + + expected = np.array([1, 2, 3], dtype=np_type) + arr = pa.array(expected, type=value_type) + check_dlpack_export(arr, expected) + + arr_sliced = arr.slice(1, 1) + expected = np.array([2], dtype=np_type) + check_dlpack_export(arr_sliced, expected) + + arr_sliced = arr.slice(0, 1) + expected = np.array([1], dtype=np_type) + check_dlpack_export(arr_sliced, expected) + + arr_sliced = arr.slice(1) + expected = np.array([2, 3], dtype=np_type) + check_dlpack_export(arr_sliced, expected) + + arr_zero = pa.array([], type=value_type) + expected = np.array([], dtype=np_type) + check_dlpack_export(arr_zero, expected) + + +def test_dlpack_not_supported(): + if Version(np.__version__) < Version("1.22.0"): + pytest.skip("No dlpack support in numpy versions older than 1.22.0.") + + arr = pa.array([1, None, 3]) + with pytest.raises(TypeError, match="Can only use DLPack " + "on arrays with no nulls."): + np.from_dlpack(arr) + + arr = pa.array( + [[0, 1], [3, 4]], + type=pa.list_(pa.int32()) + ) + with pytest.raises(TypeError, match="DataType is not compatible with DLPack spec"): + np.from_dlpack(arr) + + arr = pa.array([]) + with pytest.raises(TypeError, match="DataType is not compatible with DLPack spec"): + np.from_dlpack(arr) + + # DLPack doesn't support bit-packed boolean values + arr = pa.array([True, False, True]) + with pytest.raises(TypeError, match="Bit-packed boolean data type " + "not supported by DLPack."): + np.from_dlpack(arr) + + +def test_dlpack_cuda_not_supported(): + cuda = pytest.importorskip("pyarrow.cuda") + + schema = pa.schema([pa.field('f0', pa.int16())]) + a0 = pa.array([1, 2, 3], type=pa.int16()) + batch = pa.record_batch([a0], schema=schema) + + cbuf = cuda.serialize_record_batch(batch, cuda.Context(0)) + cbatch = cuda.read_record_batch(cbuf, batch.schema) + carr = cbatch["f0"] + + # CudaBuffers not yet supported + with pytest.raises(NotImplementedError, match="DLPack support is implemented " + "only for buffers on CPU device."): + np.from_dlpack(carr) + + with pytest.raises(NotImplementedError, match="DLPack support is implemented " + "only for buffers on CPU 
device."): + carr.__dlpack_device__() From 1c48d69844cb00918be9255f60d7eb0f59792a8b Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Wed, 20 Dec 2023 00:58:38 +0100 Subject: [PATCH 074/570] MINOR: [R] Update NEWS.md for 14.0.2 (#39286) Update NEWS.md with recent changes Authored-by: Jacob Wujciak-Jens Signed-off-by: Jacob Wujciak-Jens --- r/NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index 63f12607d8d1b..ca062b0390a9f 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,6 +21,13 @@ # arrow 14.0.2 +## Minor improvements and fixes + +* Fixed C++ compiler warnings caused by implicit conversions (#39138, #39186). +* Fixed confusing dplyr warnings during tests (#39076). +* Added missing "-framework Security" pkg-config flag to prevent + issues when compiling with strict linker settings (#38861). + # arrow 14.0.0.2 ## Minor improvements and fixes From cc9e649d0382c70552e6e556199a3e238dbb7576 Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Wed, 20 Dec 2023 09:34:23 +0000 Subject: [PATCH 075/570] GH-35331: [Python] Expose Parquet sorting metadata (#37665) ### Rationale for this change Picking up where #35453 left off. Closes https://github.com/apache/arrow/issues/35331 This PR builds on top of #37469 ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? * Closes: #35331 Lead-authored-by: Judah Rand <17158624+judahrand@users.noreply.github.com> Co-authored-by: Will Jones Signed-off-by: AlenkaF --- docs/source/python/api/formats.rst | 1 + python/pyarrow/_dataset_parquet.pyx | 2 + python/pyarrow/_parquet.pxd | 24 +- python/pyarrow/_parquet.pyx | 284 +++++++++++++++++- python/pyarrow/parquet/core.py | 12 +- python/pyarrow/tests/parquet/test_metadata.py | 84 ++++++ 6 files changed, 394 insertions(+), 13 deletions(-) diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst index 9ca499c0972e5..86e2585ac2537 100644 --- a/docs/source/python/api/formats.rst +++ b/docs/source/python/api/formats.rst @@ -97,6 +97,7 @@ Parquet Metadata FileMetaData RowGroupMetaData + SortingColumn ColumnChunkMetaData Statistics ParquetSchema diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 61e051f56cfb0..58ef6145cf7d1 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -609,6 +609,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"], write_page_index=self._properties["write_page_index"], write_page_checksum=self._properties["write_page_checksum"], + sorting_columns=self._properties["sorting_columns"], ) def _set_arrow_properties(self): @@ -659,6 +660,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): write_page_index=False, encryption_config=None, write_page_checksum=False, + sorting_columns=None, ) self._set_properties() diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 7ce747e0aa46d..ae4094d8b4b5f 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -328,11 +328,17 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: optional[ParquetIndexLocation] GetColumnIndexLocation() const optional[ParquetIndexLocation] GetOffsetIndexLocation() const + struct CSortingColumn" parquet::SortingColumn": + int column_idx + c_bool descending + c_bool nulls_first + cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData": c_bool 
Equals(const CRowGroupMetaData&) const - int num_columns() - int64_t num_rows() - int64_t total_byte_size() + int num_columns() const + int64_t num_rows() const + int64_t total_byte_size() const + vector[CSortingColumn] sorting_columns() const unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const cdef cppclass CFileMetaData" parquet::FileMetaData": @@ -421,6 +427,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_dictionary() Builder* enable_dictionary() Builder* enable_dictionary(const c_string& path) + Builder* set_sorting_columns(vector[CSortingColumn] sorting_columns) Builder* disable_statistics() Builder* enable_statistics() Builder* enable_statistics(const c_string& path) @@ -517,8 +524,8 @@ cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: CStatus ToParquetSchema( const CSchema* arrow_schema, - const ArrowReaderProperties& properties, - const shared_ptr[const CKeyValueMetadata]& key_value_metadata, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, shared_ptr[SchemaDescriptor]* out) @@ -584,7 +591,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_batch_size=*, dictionary_pagesize_limit=*, write_page_index=*, - write_page_checksum=*) except * + write_page_checksum=*, + sorting_columns=*, +) except * cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( @@ -593,7 +602,8 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( allow_truncated_timestamps=*, writer_engine_version=*, use_compliant_nested_type=*, - store_schema=*) except * + store_schema=*, +) except * cdef class ParquetSchema(_Weakrefable): cdef: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 35344eb735516..0b685245655a2 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -18,6 +18,7 @@ # cython: profile=False # distutils: language = c++ +from collections.abc import Sequence from textwrap import indent import warnings @@ -31,6 +32,7 @@ from pyarrow.lib cimport (_Weakrefable, Buffer, Schema, Table, NativeFile, pyarrow_wrap_chunked_array, pyarrow_wrap_schema, + pyarrow_unwrap_schema, pyarrow_wrap_table, pyarrow_wrap_batch, pyarrow_wrap_scalar, @@ -506,6 +508,204 @@ cdef class ColumnChunkMetaData(_Weakrefable): return self.metadata.GetColumnIndexLocation().has_value() +cdef class SortingColumn: + """ + Sorting specification for a single column. + + Returned by :meth:`RowGroupMetaData.sorting_columns` and used in + :class:`ParquetWriter` to specify the sort order of the data. + + Parameters + ---------- + column_index : int + Index of column that data is sorted by. + descending : bool, default False + Whether column is sorted in descending order. + nulls_first : bool, default False + Whether null values appear before valid values. + + Notes + ----- + + Column indices are zero-based, refer only to leaf fields, and are in + depth-first order. This may make the column indices for nested schemas + different from what you expect. In most cases, it will be easier to + specify the sort order using column names instead of column indices + and converting using the ``from_ordering`` method. 
+ + Examples + -------- + + In other APIs, sort order is specified by names, such as: + + >>> sort_order = [('id', 'ascending'), ('timestamp', 'descending')] + + For Parquet, the column index must be used instead: + + >>> import pyarrow.parquet as pq + >>> [pq.SortingColumn(0), pq.SortingColumn(1, descending=True)] + [SortingColumn(column_index=0, descending=False, nulls_first=False), SortingColumn(column_index=1, descending=True, nulls_first=False)] + + Convert the sort_order into the list of sorting columns with + ``from_ordering`` (note that the schema must be provided as well): + + >>> import pyarrow as pa + >>> schema = pa.schema([('id', pa.int64()), ('timestamp', pa.timestamp('ms'))]) + >>> sorting_columns = pq.SortingColumn.from_ordering(schema, sort_order) + >>> sorting_columns + (SortingColumn(column_index=0, descending=False, nulls_first=False), SortingColumn(column_index=1, descending=True, nulls_first=False)) + + Convert back to the sort order with ``to_ordering``: + + >>> pq.SortingColumn.to_ordering(schema, sorting_columns) + ((('id', 'ascending'), ('timestamp', 'descending')), 'at_end') + + See Also + -------- + RowGroupMetaData.sorting_columns + """ + cdef int column_index + cdef c_bool descending + cdef c_bool nulls_first + + def __init__(self, int column_index, c_bool descending=False, c_bool nulls_first=False): + self.column_index = column_index + self.descending = descending + self.nulls_first = nulls_first + + @classmethod + def from_ordering(cls, Schema schema, sort_keys, null_placement='at_end'): + """ + Create a tuple of SortingColumn objects from the same arguments as + :class:`pyarrow.compute.SortOptions`. + + Parameters + ---------- + schema : Schema + Schema of the input data. + sort_keys : Sequence of (name, order) tuples + Names of field/column keys (str) to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + null_placement : {'at_start', 'at_end'}, default 'at_end' + Where null values should appear in the sort order. + + Returns + ------- + sorting_columns : tuple of SortingColumn + """ + if null_placement == 'at_start': + nulls_first = True + elif null_placement == 'at_end': + nulls_first = False + else: + raise ValueError('null_placement must be "at_start" or "at_end"') + + col_map = _name_to_index_map(schema) + + sorting_columns = [] + + for sort_key in sort_keys: + if isinstance(sort_key, str): + name = sort_key + descending = False + elif (isinstance(sort_key, tuple) and len(sort_key) == 2 and + isinstance(sort_key[0], str) and + isinstance(sort_key[1], str)): + name, descending = sort_key + if descending == "descending": + descending = True + elif descending == "ascending": + descending = False + else: + raise ValueError("Invalid sort key direction: {0}" + .format(descending)) + else: + raise ValueError("Invalid sort key: {0}".format(sort_key)) + + try: + column_index = col_map[name] + except KeyError: + raise ValueError("Sort key name '{0}' not found in schema:\n{1}" + .format(name, schema)) + + sorting_columns.append( + cls(column_index, descending=descending, nulls_first=nulls_first) + ) + + return tuple(sorting_columns) + + @staticmethod + def to_ordering(Schema schema, sorting_columns): + """ + Convert a tuple of SortingColumn objects to the same format as + :class:`pyarrow.compute.SortOptions`. + + Parameters + ---------- + schema : Schema + Schema of the input data. + sorting_columns : tuple of SortingColumn + Columns to sort the input on. 
+ + Returns + ------- + sort_keys : tuple of (name, order) tuples + null_placement : {'at_start', 'at_end'} + """ + col_map = {i: name for name, i in _name_to_index_map(schema).items()} + + sort_keys = [] + nulls_first = None + + for sorting_column in sorting_columns: + name = col_map[sorting_column.column_index] + if sorting_column.descending: + order = "descending" + else: + order = "ascending" + sort_keys.append((name, order)) + if nulls_first is None: + nulls_first = sorting_column.nulls_first + elif nulls_first != sorting_column.nulls_first: + raise ValueError("Sorting columns have inconsistent null placement") + + if nulls_first: + null_placement = "at_start" + else: + null_placement = "at_end" + + return tuple(sort_keys), null_placement + + def __repr__(self): + return """{}(column_index={}, descending={}, nulls_first={})""".format( + self.__class__.__name__, + self.column_index, self.descending, self.nulls_first) + + def __eq__(self, SortingColumn other): + return (self.column_index == other.column_index and + self.descending == other.descending and + self.nulls_first == other.nulls_first) + + def __hash__(self): + return hash((self.column_index, self.descending, self.nulls_first)) + + @property + def column_index(self): + """"Index of column data is sorted by (int).""" + return self.column_index + + @property + def descending(self): + """Whether column is sorted in descending order (bool).""" + return self.descending + + @property + def nulls_first(self): + """Whether null values appear before valid values (bool).""" + return self.nulls_first + + cdef class RowGroupMetaData(_Weakrefable): """Metadata for a single row group.""" @@ -565,10 +765,12 @@ cdef class RowGroupMetaData(_Weakrefable): return """{0} num_columns: {1} num_rows: {2} - total_byte_size: {3}""".format(object.__repr__(self), + total_byte_size: {3} + sorting_columns: {4}""".format(object.__repr__(self), self.num_columns, self.num_rows, - self.total_byte_size) + self.total_byte_size, + self.sorting_columns) def to_dict(self): """ @@ -585,6 +787,7 @@ cdef class RowGroupMetaData(_Weakrefable): num_rows=self.num_rows, total_byte_size=self.total_byte_size, columns=columns, + sorting_columns=[col.to_dict() for col in self.sorting_columns] ) for i in range(self.num_columns): columns.append(self.column(i).to_dict()) @@ -605,6 +808,19 @@ cdef class RowGroupMetaData(_Weakrefable): """Total byte size of all the uncompressed column data in this row group (int).""" return self.metadata.total_byte_size() + @property + def sorting_columns(self): + """Columns the row group is sorted by (tuple of :class:`SortingColumn`)).""" + out = [] + cdef vector[CSortingColumn] sorting_columns = self.metadata.sorting_columns() + for sorting_col in sorting_columns: + out.append(SortingColumn( + sorting_col.column_idx, + sorting_col.descending, + sorting_col.nulls_first + )) + return tuple(out) + def _reconstruct_filemetadata(Buffer serialized): cdef: @@ -1550,6 +1766,28 @@ cdef class ParquetReader(_Weakrefable): return closed +cdef CSortingColumn _convert_sorting_column(SortingColumn sorting_column): + cdef CSortingColumn c_sorting_column + + c_sorting_column.column_idx = sorting_column.column_index + c_sorting_column.descending = sorting_column.descending + c_sorting_column.nulls_first = sorting_column.nulls_first + + return c_sorting_column + + +cdef vector[CSortingColumn] _convert_sorting_columns(sorting_columns) except *: + if not (isinstance(sorting_columns, Sequence) + and all(isinstance(col, SortingColumn) for col in sorting_columns)): + 
raise ValueError( + "'sorting_columns' must be a list of `SortingColumn`") + + cdef vector[CSortingColumn] c_sorting_columns = [_convert_sorting_column(col) + for col in sorting_columns] + + return c_sorting_columns + + cdef shared_ptr[WriterProperties] _create_writer_properties( use_dictionary=None, compression=None, @@ -1564,7 +1802,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_batch_size=None, dictionary_pagesize_limit=None, write_page_index=False, - write_page_checksum=False) except *: + write_page_checksum=False, + sorting_columns=None) except *: """General writer properties""" cdef: shared_ptr[WriterProperties] properties @@ -1649,6 +1888,11 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( for column in write_statistics: props.enable_statistics(tobytes(column)) + # sorting_columns + + if sorting_columns is not None: + props.set_sorting_columns(_convert_sorting_columns(sorting_columns)) + # use_byte_stream_split if isinstance(use_byte_stream_split, bool): @@ -1788,6 +2032,34 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( return arrow_properties +cdef _name_to_index_map(Schema arrow_schema): + cdef: + shared_ptr[CSchema] sp_arrow_schema + shared_ptr[SchemaDescriptor] sp_parquet_schema + shared_ptr[WriterProperties] props = _create_writer_properties() + shared_ptr[ArrowWriterProperties] arrow_props = _create_arrow_writer_properties( + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, + allow_truncated_timestamps=False, + writer_engine_version="V2" + ) + + sp_arrow_schema = pyarrow_unwrap_schema(arrow_schema) + + with nogil: + check_status(ToParquetSchema( + sp_arrow_schema.get(), deref(props.get()), deref(arrow_props.get()), &sp_parquet_schema)) + + out = dict() + + cdef SchemaDescriptor* parquet_schema = sp_parquet_schema.get() + + for i in range(parquet_schema.num_columns()): + name = frombytes(parquet_schema.Column(i).path().get().ToDotString()) + out[name] = i + + return out + cdef class ParquetWriter(_Weakrefable): cdef: @@ -1835,7 +2107,8 @@ cdef class ParquetWriter(_Weakrefable): dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, - write_page_checksum=False): + write_page_checksum=False, + sorting_columns=None): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -1867,7 +2140,8 @@ cdef class ParquetWriter(_Weakrefable): write_batch_size=write_batch_size, dictionary_pagesize_limit=dictionary_pagesize_limit, write_page_index=write_page_index, - write_page_checksum=write_page_checksum + write_page_checksum=write_page_checksum, + sorting_columns=sorting_columns, ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index db22eb3293c86..852b339211b0d 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -48,7 +48,8 @@ ParquetSchema, ColumnSchema, ParquetLogicalType, FileEncryptionProperties, - FileDecryptionProperties) + FileDecryptionProperties, + SortingColumn) from pyarrow.fs import (LocalFileSystem, FileSystem, FileType, _resolve_filesystem_and_path, _ensure_filesystem) from pyarrow import filesystem as legacyfs @@ -895,6 +896,10 @@ def _sanitize_table(table, new_schema, flavor): Whether to write page checksums in general for all columns. Page checksums enable detection of data corruption, which might occur during transmission or in the storage. 
+sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. """ _parquet_writer_example_doc = """\ @@ -989,6 +994,7 @@ def __init__(self, where, schema, filesystem=None, store_schema=True, write_page_index=False, write_page_checksum=False, + sorting_columns=None, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1047,6 +1053,7 @@ def __init__(self, where, schema, filesystem=None, store_schema=store_schema, write_page_index=write_page_index, write_page_checksum=write_page_checksum, + sorting_columns=sorting_columns, **options) self.is_open = True @@ -3129,6 +3136,7 @@ def write_table(table, where, row_group_size=None, version='2.6', store_schema=True, write_page_index=False, write_page_checksum=False, + sorting_columns=None, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -3158,6 +3166,7 @@ def write_table(table, where, row_group_size=None, version='2.6', store_schema=store_schema, write_page_index=write_page_index, write_page_checksum=write_page_checksum, + sorting_columns=sorting_columns, **kwargs) as writer: writer.write_table(table, row_group_size=row_group_size) except Exception: @@ -3742,6 +3751,7 @@ def read_schema(where, memory_map=False, decryption_properties=None, "ParquetWriter", "PartitionSet", "RowGroupMetaData", + "SortingColumn", "Statistics", "read_metadata", "read_pandas", diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 3efaf1dbf5526..73284d2e53b9e 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -301,6 +301,90 @@ def test_parquet_write_disable_statistics(tempdir): assert cc_b.statistics is None +def test_parquet_sorting_column(): + sorting_col = pq.SortingColumn(10) + assert sorting_col.column_index == 10 + assert sorting_col.descending is False + assert sorting_col.nulls_first is False + + sorting_col = pq.SortingColumn(0, descending=True, nulls_first=True) + assert sorting_col.column_index == 0 + assert sorting_col.descending is True + assert sorting_col.nulls_first is True + + schema = pa.schema([('a', pa.int64()), ('b', pa.int64())]) + sorting_cols = ( + pq.SortingColumn(1, descending=True), + pq.SortingColumn(0, descending=False), + ) + sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_cols) + assert sort_order == (('b', "descending"), ('a', "ascending")) + assert null_placement == "at_end" + + sorting_cols_roundtripped = pq.SortingColumn.from_ordering( + schema, sort_order, null_placement) + assert sorting_cols_roundtripped == sorting_cols + + sorting_cols = pq.SortingColumn.from_ordering( + schema, ('a', ('b', "descending")), null_placement="at_start") + expected = ( + pq.SortingColumn(0, descending=False, nulls_first=True), + pq.SortingColumn(1, descending=True, nulls_first=True), + ) + assert sorting_cols == expected + + # Conversions handle empty tuples + empty_sorting_cols = pq.SortingColumn.from_ordering(schema, ()) + assert empty_sorting_cols == () + + assert pq.SortingColumn.to_ordering(schema, ()) == ((), "at_end") + + with pytest.raises(ValueError): + pq.SortingColumn.from_ordering(schema, (("a", "not a valid sort order"))) + + 
with pytest.raises(ValueError, match="inconsistent null placement"): + sorting_cols = ( + pq.SortingColumn(1, nulls_first=True), + pq.SortingColumn(0, nulls_first=False), + ) + pq.SortingColumn.to_ordering(schema, sorting_cols) + + +def test_parquet_sorting_column_nested(): + schema = pa.schema({ + 'a': pa.struct([('x', pa.int64()), ('y', pa.int64())]), + 'b': pa.int64() + }) + + sorting_columns = [ + pq.SortingColumn(0, descending=True), # a.x + pq.SortingColumn(2, descending=False) # b + ] + + sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_columns) + assert null_placement == "at_end" + assert len(sort_order) == 2 + assert sort_order[0] == ("a.x", "descending") + assert sort_order[1] == ("b", "ascending") + + +def test_parquet_file_sorting_columns(): + table = pa.table({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) + + sorting_columns = ( + pq.SortingColumn(column_index=0, descending=True, nulls_first=True), + pq.SortingColumn(column_index=1, descending=False), + ) + writer = pa.BufferOutputStream() + _write_table(table, writer, sorting_columns=sorting_columns) + reader = pa.BufferReader(writer.getvalue()) + + # Can retrieve sorting columns from metadata + metadata = pq.read_metadata(reader) + assert metadata.num_row_groups == 1 + assert sorting_columns == metadata.row_group(0).sorting_columns + + def test_field_id_metadata(): # ARROW-7080 field_id = b'PARQUET:field_id' From b1fcba1b395e0aedddcdab19958c14809d780d4c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 20 Dec 2023 11:06:57 +0100 Subject: [PATCH 076/570] MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274) ### Rationale for this change local_timestamp kernel docs are linked in [cpp](https://arrow.apache.org/docs/cpp/compute.html#timezone-handling) but not in [python docs](https://arrow.apache.org/docs/python/api/compute.html#timezone-handling). ### What changes are included in this PR? This adds a rst link in python docs ### Are these changes tested? No ### Are there any user-facing changes? Change will be visible in the docs Authored-by: Rok Mihevc Signed-off-by: Joris Van den Bossche --- docs/source/python/api/compute.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 4ee364fcf636b..b879643017a90 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -468,6 +468,7 @@ Timezone Handling :toctree: ../generated/ assume_timezone + local_timestamp Associative Transforms ---------------------- From 87865b5a85c722ef7578aed4300e9d0b219c909c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 20 Dec 2023 12:07:06 +0100 Subject: [PATCH 077/570] GH-39306: [C++][Benchmarking] Remove hardcoded min times (#39307) ### Rationale for this change `MinTime` settings hardcoded in the C++ source code prevent the `--benchmark_min_time` CLI option from working. ### Are these changes tested? No. ### Are there any user-facing changes? No. 
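Editor's note: with the hardcoded MinTime(1.0) calls removed, the minimum measurement time is chosen when running the benchmark binary via google-benchmark's --benchmark_min_time flag rather than in source. A self-contained sketch of the resulting registration pattern (BM_Example and the flag syntax in the comment are illustrative; the accepted value format depends on the google-benchmark version in use):

#include <benchmark/benchmark.h>

// Hypothetical benchmark registered without a hardcoded MinTime(). The time
// floor is now set at run time, e.g.
//   ./some-benchmark-binary --benchmark_min_time=1x
// (or a seconds value, depending on the google-benchmark version).
static void BM_Example(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(42 * 42);
  }
}
BENCHMARK(BM_Example)->Unit(benchmark::kMicrosecond);
BENCHMARK_MAIN();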
* Closes: #39306 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../kernels/vector_partition_benchmark.cc | 1 - .../compute/kernels/vector_topk_benchmark.cc | 1 - cpp/src/gandiva/tests/micro_benchmarks.cc | 38 +++++++++---------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc index ff009c65543a6..f21dd8317e493 100644 --- a/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc @@ -52,7 +52,6 @@ BENCHMARK(NthToIndicesInt64) ->Apply(RegressionSetArgs) ->Args({1 << 20, 100}) ->Args({1 << 23, 100}) - ->MinTime(1.0) ->Unit(benchmark::TimeUnit::kNanosecond); } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc index 3f89eb6bea9cd..e95e7a6f02a04 100644 --- a/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc @@ -52,7 +52,6 @@ BENCHMARK(SelectKInt64) ->Apply(RegressionSetArgs) ->Args({1 << 20, 100}) ->Args({1 << 23, 100}) - ->MinTime(1.0) ->Unit(benchmark::TimeUnit::kNanosecond); } // namespace compute diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index ed77f8ae5045b..f126b769b2010 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -460,24 +460,24 @@ static void DecimalAdd3Large(benchmark::State& state) { DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18, true); } -BENCHMARK(TimedTestAdd3)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestBigNested)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestExtractYear)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestFilterAdd2)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestCastFloatFromString)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestCastIntFromString)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestOutputStringAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd2Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); -BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestAdd3)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestBigNested)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestExtractYear)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestFilterAdd2)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestFilterLike)->Unit(benchmark::kMicrosecond); 
+BENCHMARK(TimedTestCastFloatFromString)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestCastIntFromString)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestAllocs)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestOutputStringAllocs)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestMultiOr)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestInExpr)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2Fast)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2LeadingZeroes)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2Large)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3Fast)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3LeadingZeroes)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3Large)->Unit(benchmark::kMicrosecond); } // namespace gandiva From 726568936e345ee2a15d3a5ad5d654e14939d673 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 10:42:46 -0300 Subject: [PATCH 078/570] GH-39297: [C++][FS]: Inform caller of container not-existing when checking for HNS support (#39298) ### Rationale for this change An operation checking for Hierarchical Namespace support shouldn't fail completely when the reason for the check failing is the container not existing. We can allow the caller to decide what to do in that situation by returning a result that indicates the check didn't succeed because the container doesn't exist. ### What changes are included in this PR? - Removal of the `azurefs_intern.h/cc` files - Implementation of the check as a free-function instead of a class - Memoization of the result in the `AzureFileSystem` class ### Are these changes tested? Yes. The tests were improved to cover all cases. * Closes: #39297 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/CMakeLists.txt | 4 +- cpp/src/arrow/filesystem/azurefs.cc | 320 ++++++++++++------- cpp/src/arrow/filesystem/azurefs.h | 51 ++- cpp/src/arrow/filesystem/azurefs_internal.cc | 94 ------ cpp/src/arrow/filesystem/azurefs_internal.h | 39 --- cpp/src/arrow/filesystem/azurefs_test.cc | 64 +++- 6 files changed, 309 insertions(+), 263 deletions(-) delete mode 100644 cpp/src/arrow/filesystem/azurefs_internal.cc delete mode 100644 cpp/src/arrow/filesystem/azurefs_internal.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 00947c6275678..c1fafeebc035d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -506,8 +506,8 @@ if(ARROW_FILESYSTEM) filesystem/util_internal.cc) if(ARROW_AZURE) - list(APPEND ARROW_SRCS filesystem/azurefs.cc filesystem/azurefs_internal.cc) - set_source_files_properties(filesystem/azurefs.cc filesystem/azurefs_internal.cc + list(APPEND ARROW_SRCS filesystem/azurefs.cc) + set_source_files_properties(filesystem/azurefs.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) endif() diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 1aa3e86a6f926..032cd034e7abb 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -16,7 +16,6 @@ // under the License. 
#include "arrow/filesystem/azurefs.h" -#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -42,6 +41,8 @@ namespace DataLake = Azure::Storage::Files::DataLake; namespace Http = Azure::Core::Http; namespace Storage = Azure::Storage; +using internal::HNSSupport; + // ----------------------------------------------------------------------- // AzureOptions Implementation @@ -263,9 +264,11 @@ Status StatusFromErrorResponse(const std::string& url, "): ", body_text); } -bool IsContainerNotFound(const Storage::StorageException& exception) { - if (exception.ErrorCode == "ContainerNotFound") { - DCHECK_EQ(exception.StatusCode, Http::HttpStatusCode::NotFound); +bool IsContainerNotFound(const Storage::StorageException& e) { + if (e.ErrorCode == "ContainerNotFound" || + e.ReasonPhrase == "The specified container does not exist." || + e.ReasonPhrase == "The specified filesystem does not exist.") { + DCHECK_EQ(e.StatusCode, Http::HttpStatusCode::NotFound); return true; } return false; @@ -441,8 +444,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } return ExceptionToStatus( "GetProperties failed for '" + blob_client_->GetUrl() + - "' with an unexpected Azure error. Cannot initialise an ObjectInputFile " - "without knowing the file size.", + "'. Cannot initialise an ObjectInputFile without knowing the file size.", exception); } } @@ -520,12 +522,11 @@ class ObjectInputFile final : public io::RandomAccessFile { ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("DownloadTo from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + " for " + - std::to_string(nbytes) + - " bytes failed with an Azure error. ReadAt " - "failed to read the required byte range.", - exception); + return ExceptionToStatus( + "DownloadTo from '" + blob_client_->GetUrl() + "' at position " + + std::to_string(position) + " for " + std::to_string(nbytes) + + " bytes failed. ReadAt failed to read the required byte range.", + exception); } } @@ -576,9 +577,8 @@ Status CreateEmptyBlockBlob(std::shared_ptr block_blob_c } catch (const Storage::StorageException& exception) { return ExceptionToStatus( "UploadFrom failed for '" + block_blob_client->GetUrl() + - "' with an unexpected Azure error. There is no existing blob at this " - "location or the existing blob must be replaced so ObjectAppendStream must " - "create a new empty block blob.", + "'. There is no existing blob at this location or the existing blob must be " + "replaced so ObjectAppendStream must create a new empty block blob.", exception); } return Status::OK(); @@ -591,8 +591,7 @@ Result GetBlockList( } catch (Storage::StorageException& exception) { return ExceptionToStatus( "GetBlockList failed for '" + block_blob_client->GetUrl() + - "' with an unexpected Azure error. Cannot write to a file without first " - "fetching the existing block list.", + "'. Cannot write to a file without first fetching the existing block list.", exception); } } @@ -620,8 +619,7 @@ Status CommitBlockList(std::shared_ptr block_bl } catch (const Storage::StorageException& exception) { return ExceptionToStatus( "CommitBlockList failed for '" + block_blob_client->GetUrl() + - "' with an unexpected Azure error. Committing is required to flush an " - "output/append stream.", + "'. 
Committing is required to flush an output/append stream.", exception); } return Status::OK(); @@ -665,9 +663,8 @@ class ObjectAppendStream final : public io::OutputStream { } else { return ExceptionToStatus( "GetProperties failed for '" + block_blob_client_->GetUrl() + - "' with an unexpected Azure error. Cannot initialise an " - "ObjectAppendStream without knowing whether a file already exists at " - "this path, and if it exists, its size.", + "'. Cannot initialise an ObjectAppendStream without knowing whether a " + "file already exists at this path, and if it exists, its size.", exception); } content_length_ = 0; @@ -765,8 +762,7 @@ class ObjectAppendStream final : public io::OutputStream { return ExceptionToStatus( "StageBlock failed for '" + block_blob_client_->GetUrl() + "' new_block_id: '" + new_block_id + - "' with an unexpected Azure error. Staging new blocks is fundamental to " - "streaming writes to blob storage.", + "'. Staging new blocks is fundamental to streaming writes to blob storage.", exception); } block_ids_.push_back(new_block_id); @@ -786,11 +782,116 @@ class ObjectAppendStream final : public io::OutputStream { Storage::Metadata metadata_; }; +bool IsDfsEmulator(const AzureOptions& options) { + return options.dfs_storage_authority != ".dfs.core.windows.net"; +} + } // namespace +// ----------------------------------------------------------------------- +// internal implementation + +namespace internal { + +Result CheckIfHierarchicalNamespaceIsEnabled( + DataLake::DataLakeFileSystemClient& adlfs_client, const AzureOptions& options) { + try { + auto directory_client = adlfs_client.GetDirectoryClient(""); + // GetAccessControlList will fail on storage accounts + // without hierarchical namespace enabled. + directory_client.GetAccessControlList(); + return HNSSupport::kEnabled; + } catch (std::out_of_range& exception) { + // Azurite issue detected. + DCHECK(IsDfsEmulator(options)); + return HNSSupport::kDisabled; + } catch (const Storage::StorageException& exception) { + // Flat namespace storage accounts with "soft delete" enabled return + // + // "Conflict - This endpoint does not support BlobStorageEvents + // or SoftDelete. [...]" [1], + // + // otherwise it returns: + // + // "BadRequest - This operation is only supported on a hierarchical namespace + // account." + // + // [1]: + // https://learn.microsoft.com/en-us/answers/questions/1069779/this-endpoint-does-not-support-blobstorageevents-o + switch (exception.StatusCode) { + case Http::HttpStatusCode::BadRequest: + case Http::HttpStatusCode::Conflict: + return HNSSupport::kDisabled; + case Http::HttpStatusCode::NotFound: + if (IsDfsEmulator(options)) { + return HNSSupport::kDisabled; + } + // Did we get an error because of the container not existing? + if (IsContainerNotFound(exception)) { + return HNSSupport::kContainerNotFound; + } + [[fallthrough]]; + default: + if (exception.ErrorCode == "HierarchicalNamespaceNotEnabled") { + return HNSSupport::kDisabled; + } + return ExceptionToStatus("Check for Hierarchical Namespace support on '" + + adlfs_client.GetUrl() + "' failed.", + exception); + } + } +} + +} // namespace internal + // ----------------------------------------------------------------------- // AzureFilesystem Implementation +namespace { + +// In Azure Storage terminology, a "container" and a "filesystem" are the same +// kind of object, but it can be accessed using different APIs. The Blob Storage +// API calls it a "container", the Data Lake Storage Gen 2 API calls it a +// "filesystem". 
Creating a container using the Blob Storage API will make it +// accessible using the Data Lake Storage Gen 2 API and vice versa. + +template +Result GetContainerPropsAsFileInfo(const std::string& container_name, + ContainerClient& container_client) { + FileInfo info{container_name}; + try { + auto properties = container_client.GetProperties(); + info.set_type(FileType::Directory); + info.set_mtime(std::chrono::system_clock::time_point{properties.Value.LastModified}); + return info; + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + info.set_type(FileType::NotFound); + return info; + } + return ExceptionToStatus( + "GetProperties for '" + container_client.GetUrl() + "' failed.", exception); + } +} + +FileInfo DirectoryFileInfoFromPath(std::string_view path) { + return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, FileType::Directory}; +} + +FileInfo FileInfoFromBlob(std::string_view container, + const Blobs::Models::BlobItem& blob) { + auto path = internal::ConcatAbstractPath(container, blob.Name); + if (internal::HasTrailingSlash(blob.Name)) { + return DirectoryFileInfoFromPath(path); + } + FileInfo info{std::move(path), FileType::File}; + info.set_size(blob.BlobSize); + info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); + return info; +} + +} // namespace + class AzureFileSystem::Impl { private: io::IOContext io_context_; @@ -798,7 +899,7 @@ class AzureFileSystem::Impl { std::unique_ptr datalake_service_client_; std::unique_ptr blob_service_client_; - internal::HierarchicalNamespaceDetector hns_detector_; + HNSSupport cached_hns_support_ = HNSSupport::kUnknown; Impl(AzureOptions options, io::IOContext io_context) : io_context_(std::move(io_context)), options_(std::move(options)) {} @@ -812,52 +913,54 @@ class AzureFileSystem::Impl { self->options_.MakeBlobServiceClient()); ARROW_ASSIGN_OR_RAISE(self->datalake_service_client_, self->options_.MakeDataLakeServiceClient()); - RETURN_NOT_OK(self->hns_detector_.Init(self->datalake_service_client_.get())); return self; } io::IOContext& io_context() { return io_context_; } const AzureOptions& options() const { return options_; } + private: + /// \brief Memoized version of CheckIfHierarchicalNamespaceIsEnabled. + /// + /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never returned). + Result HierarchicalNamespaceSupport( + DataLake::DataLakeFileSystemClient& adlfs_client) { + switch (cached_hns_support_) { + case HNSSupport::kEnabled: + case HNSSupport::kDisabled: + return cached_hns_support_; + case HNSSupport::kUnknown: + case HNSSupport::kContainerNotFound: + // Try the check again because the support is still unknown or the container + // that didn't exist before may exist now. + break; + } + ARROW_ASSIGN_OR_RAISE( + cached_hns_support_, + internal::CheckIfHierarchicalNamespaceIsEnabled(adlfs_client, options_)); + DCHECK_NE(cached_hns_support_, HNSSupport::kUnknown); + // Caller should handle kContainerNotFound case appropriately. + return cached_hns_support_; + } + public: Result GetFileInfo(const AzureLocation& location) { - FileInfo info; - info.set_path(location.all); - if (location.container.empty()) { - // The location is invalid if the container is empty but the path is not. DCHECK(location.path.empty()); - // This location must be derived from the root path. FileInfo should describe it - // as a directory and there isn't any extra metadata to fetch. 
- info.set_type(FileType::Directory); - return info; + // Root directory of the storage account. + return FileInfo{"", FileType::Directory}; } if (location.path.empty()) { - // The location refers to a container. This is a directory if it exists. + // We have a container, but no path within the container. + // The container itself represents a directory. auto container_client = blob_service_client_->GetBlobContainerClient(location.container); - try { - auto properties = container_client.GetProperties(); - info.set_type(FileType::Directory); - info.set_mtime( - std::chrono::system_clock::time_point{properties.Value.LastModified}); - return info; - } catch (const Storage::StorageException& exception) { - if (IsContainerNotFound(exception)) { - info.set_type(FileType::NotFound); - return info; - } - return ExceptionToStatus( - "GetProperties for '" + container_client.GetUrl() + - "' failed with an unexpected Azure error. GetFileInfo is unable to " - "determine whether the container exists.", - exception); - } + return GetContainerPropsAsFileInfo(location.container, container_client); } - // There is a path to search within the container. - auto file_client = datalake_service_client_->GetFileSystemClient(location.container) - .GetFileClient(location.path); + FileInfo info{location.all}; + auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + auto file_client = adlfs_client.GetFileClient(location.path); try { auto properties = file_client.GetProperties(); if (properties.Value.IsDirectory) { @@ -879,11 +982,12 @@ class AzureFileSystem::Impl { return info; } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hns_detector_.Enabled(location.container)); - if (hierarchical_namespace_enabled) { - // If the hierarchical namespace is enabled, then the storage account will have - // explicit directories. Neither a file nor a directory was found. + ARROW_ASSIGN_OR_RAISE(auto hns_support, + HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound || + hns_support == HNSSupport::kEnabled) { + // If the hierarchical namespace is enabled, then the storage account will + // have explicit directories. Neither a file nor a directory was found. info.set_type(FileType::NotFound); return info; } @@ -907,16 +1011,15 @@ class AzureFileSystem::Impl { return info; } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "ListBlobs for '" + *list_blob_options.Prefix + - "' failed with an unexpected Azure error. GetFileInfo is unable to " - "determine whether the path should be considered an implied directory.", + "ListBlobs failed for prefix='" + *list_blob_options.Prefix + + "' failed. GetFileInfo is unable to determine whether the path should " + "be considered an implied directory.", exception); } } return ExceptionToStatus( - "GetProperties for '" + file_client.GetUrl() + - "' failed with an unexpected " - "Azure error. 
GetFileInfo is unable to determine whether the path exists.", + "GetProperties failed for '" + file_client.GetUrl() + + "' GetFileInfo is unable to determine whether the path exists.", exception); } } @@ -940,23 +1043,6 @@ class AzureFileSystem::Impl { return Status::OK(); } - static FileInfo FileInfoFromBlob(std::string_view container, - const Blobs::Models::BlobItem& blob) { - auto path = internal::ConcatAbstractPath(container, blob.Name); - if (internal::HasTrailingSlash(blob.Name)) { - return DirectoryFileInfoFromPath(path); - } - FileInfo info{std::move(path), FileType::File}; - info.set_size(blob.BlobSize); - info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); - return info; - } - - static FileInfo DirectoryFileInfoFromPath(std::string_view path) { - return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, - FileType::Directory}; - } - static std::string_view BasenameView(std::string_view s) { DCHECK(!internal::HasTrailingSlash(s)); auto offset = s.find_last_of(internal::kSep); @@ -1158,9 +1244,9 @@ class AzureFileSystem::Impl { return Status::Invalid("Cannot create an empty container"); } + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); if (location.path.empty()) { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); try { auto response = container_client.Create(); if (response.Value.Created) { @@ -1177,18 +1263,25 @@ class AzureFileSystem::Impl { } } - ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hns_detector_.Enabled(location.container)); - if (!hierarchical_namespace_enabled) { + auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(location); + } + if (hns_support == HNSSupport::kDisabled) { + ARROW_ASSIGN_OR_RAISE( + auto container_info, + GetContainerPropsAsFileInfo(location.container, container_client)); + if (container_info.type() == FileType::NotFound) { + return PathNotFound(location); + } // Without hierarchical namespace enabled Azure blob storage has no directories. // Therefore we can't, and don't need to create one. Simply creating a blob with `/` // in the name implies directories. return Status::OK(); } - auto directory_client = - datalake_service_client_->GetFileSystemClient(location.container) - .GetDirectoryClient(location.path); + auto directory_client = adlfs_client.GetDirectoryClient(location.path); try { auto response = directory_client.Create(); if (response.Value.Created) { @@ -1219,19 +1312,19 @@ class AzureFileSystem::Impl { exception); } - ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hns_detector_.Enabled(location.container)); - if (!hierarchical_namespace_enabled) { + auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kDisabled) { // Without hierarchical namespace enabled Azure blob storage has no directories. // Therefore we can't, and don't need to create one. Simply creating a blob with `/` // in the name implies directories. return Status::OK(); } + // Don't handle HNSSupport::kContainerNotFound, just assume it still exists (because + // it was created above) and try to create the directory. 
if (!location.path.empty()) { - auto directory_client = - datalake_service_client_->GetFileSystemClient(location.container) - .GetDirectoryClient(location.path); + auto directory_client = adlfs_client.GetDirectoryClient(location.path); try { directory_client.CreateIfNotExists(); } catch (const Storage::StorageException& exception) { @@ -1344,6 +1437,12 @@ class AzureFileSystem::Impl { return Status::Invalid("Cannot delete an empty container"); } + auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(location); + } + if (location.path.empty()) { auto container_client = blob_service_client_->GetBlobContainerClient(location.container); @@ -1363,12 +1462,8 @@ class AzureFileSystem::Impl { } } - ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hns_detector_.Enabled(location.container)); - if (hierarchical_namespace_enabled) { - auto directory_client = - datalake_service_client_->GetFileSystemClient(location.container) - .GetDirectoryClient(location.path); + if (hns_support == HNSSupport::kEnabled) { + auto directory_client = adlfs_client.GetDirectoryClient(location.path); try { auto response = directory_client.DeleteRecursive(); if (response.Value.Deleted) { @@ -1394,19 +1489,20 @@ class AzureFileSystem::Impl { return internal::InvalidDeleteDirContents(location.all); } - ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hns_detector_.Enabled(location.container)); - if (hierarchical_namespace_enabled) { - auto file_system_client = - datalake_service_client_->GetFileSystemClient(location.container); - auto directory_client = file_system_client.GetDirectoryClient(location.path); + auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return missing_dir_ok ? 
Status::OK() : PathNotFound(location); + } + + if (hns_support == HNSSupport::kEnabled) { + auto directory_client = adlfs_client.GetDirectoryClient(location.path); try { auto list_response = directory_client.ListPaths(false); for (; list_response.HasPage(); list_response.MoveToNextPage()) { for (const auto& path : list_response.Paths) { if (path.IsDirectory) { - auto sub_directory_client = - file_system_client.GetDirectoryClient(path.Name); + auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); try { sub_directory_client.DeleteRecursive(); } catch (const Storage::StorageException& exception) { @@ -1416,7 +1512,7 @@ class AzureFileSystem::Impl { exception); } } else { - auto sub_file_client = file_system_client.GetFileClient(path.Name); + auto sub_file_client = adlfs_client.GetFileClient(path.Name); try { sub_file_client.Delete(); } catch (const Storage::StorageException& exception) { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 35c140b1097c7..b7ef2bb3130c2 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -38,8 +38,9 @@ class BlobServiceClient; } namespace Azure::Storage::Files::DataLake { +class DataLakeFileSystemClient; class DataLakeServiceClient; -} +} // namespace Azure::Storage::Files::DataLake namespace arrow::fs { @@ -117,6 +118,54 @@ struct ARROW_EXPORT AzureOptions { MakeDataLakeServiceClient() const; }; +namespace internal { + +enum class HNSSupport { + kUnknown = 0, + kContainerNotFound = 1, + kDisabled = 2, + kEnabled = 3, +}; + +/// \brief Performs a request to check if the storage account has Hierarchical +/// Namespace support enabled. +/// +/// This check requires a DataLakeFileSystemClient for any container of the +/// storage account. If the container doesn't exist yet, we just forward that +/// error to the caller (kContainerNotFound) since that's a proper error to the operation +/// on that container anyways -- no need to try again with or without the knowledge of +/// Hierarchical Namespace support. +/// +/// Hierarchical Namespace support can't easily be changed after the storage account is +/// created and the feature is shared by all containers in the storage account. +/// This means the result of this check can (and should!) be cached as soon as +/// it returns a successful result on any container of the storage account (see +/// AzureFileSystem::Impl). +/// +/// The check consists of a call to DataLakeFileSystemClient::GetAccessControlList() +/// on the root directory of the container. An approach taken by the Hadoop Azure +/// project [1]. A more obvious approach would be to call +/// BlobServiceClient::GetAccountInfo(), but that endpoint requires elevated +/// permissions [2] that we can't generally rely on. +/// +/// [1]: +/// https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. +/// [2]: +/// https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization +/// +/// IMPORTANT: If the result is kEnabled or kDisabled, it doesn't necessarily mean that +/// the container exists. +/// +/// \param adlfs_client A DataLakeFileSystemClient for a container of the storage +/// account. +/// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never +/// returned). 
+Result CheckIfHierarchicalNamespaceIsEnabled( + Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureOptions& options); + +} // namespace internal + /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and /// Azure Data Lake Storage Gen2 (ADLS Gen2) [2]. /// diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc deleted file mode 100644 index 39c3fb23e3cfd..0000000000000 --- a/cpp/src/arrow/filesystem/azurefs_internal.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/filesystem/azurefs_internal.h" - -#include - -#include "arrow/result.h" - -namespace arrow::fs::internal { - -namespace { - -// TODO(GH-38772): Remove azurefs_internal.h/.cc by moving the detector to -// azurefs.cc (which contains a private copy of this helper function already). -Status ExceptionToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); -} - -} // namespace - -Status HierarchicalNamespaceDetector::Init( - Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client) { - datalake_service_client_ = datalake_service_client; - return Status::OK(); -} - -Result HierarchicalNamespaceDetector::Enabled(const std::string& container_name) { - // Hierarchical namespace can't easily be changed after the storage account is created - // and its common across all containers in the storage account. Do nothing until we've - // checked for a cached result. - if (enabled_.has_value()) { - return enabled_.value(); - } - - // This approach is inspired by hadoop-azure - // https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. - // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly - // elevated permissions. - // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization - auto filesystem_client = datalake_service_client_->GetFileSystemClient(container_name); - auto directory_client = filesystem_client.GetDirectoryClient("/"); - try { - directory_client.GetAccessControlList(); - enabled_ = true; - } catch (const Azure::Storage::StorageException& exception) { - // GetAccessControlList will fail on storage accounts without hierarchical - // namespace enabled. 
- - if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || - exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { - // Flat namespace storage accounts with soft delete enabled return - // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete - // otherwise it returns: BadRequest - This operation is only supported on a - // hierarchical namespace account. - enabled_ = false; - } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // Azurite returns NotFound. - try { - filesystem_client.GetProperties(); - enabled_ = false; - } catch (const Azure::Storage::StorageException& exception) { - return ExceptionToStatus("Failed to confirm '" + filesystem_client.GetUrl() + - "' is an accessible container. Therefore the " - "hierarchical namespace check was invalid.", - exception); - } - } else { - return ExceptionToStatus( - "GetAccessControlList for '" + directory_client.GetUrl() + - "' failed with an unexpected Azure error, while checking " - "whether the storage account has hierarchical namespace enabled.", - exception); - } - } - return enabled_.value(); -} - -} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h deleted file mode 100644 index 92592cf164f5a..0000000000000 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include - -#include "arrow/result.h" - -namespace arrow::fs::internal { - -class HierarchicalNamespaceDetector { - public: - Status Init( - Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client); - Result Enabled(const std::string& container_name); - - private: - Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client_; - std::optional enabled_; -}; - -} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 8a39c4c554897..db0e133e0d453 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,7 +34,6 @@ #include #include "arrow/filesystem/azurefs.h" -#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -520,7 +519,8 @@ class TestAzureFileSystem : public ::testing::Test { // Tests that are called from more than one implementation of TestAzureFileSystem - void TestDetectHierarchicalNamespace(); + void TestDetectHierarchicalNamespace(bool trip_up_azurite); + void TestDetectHierarchicalNamespaceOnMissingContainer(); void TestGetFileInfoObject(); void TestGetFileInfoObjectWithNestedStructure(); @@ -610,14 +610,49 @@ class TestAzureFileSystem : public ::testing::Test { } }; -void TestAzureFileSystem::TestDetectHierarchicalNamespace() { - // Check the environments are implemented and injected here correctly. - auto expected = WithHierarchicalNamespace(); +void TestAzureFileSystem::TestDetectHierarchicalNamespace(bool trip_up_azurite) { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (trip_up_azurite && env->backend() != AzureBackend::kAzurite) { + GTEST_SKIP() << "trip_up_azurite=true is only for Azurite."; + } auto data = SetUpPreexistingData(); - auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_OK_AND_EQ(expected, hierarchical_namespace.Enabled(data.container_name)); + if (trip_up_azurite) { + // Azurite causes GetDirectoryClient("/") to throw a std::out_of_range + // exception when a "/" blob exists, so we exercise that code path. 
+ auto container_client = + blob_service_client_->GetBlobContainerClient(data.container_name); + CreateBlob(container_client, "/"); + } + + auto adlfs_client = datalake_service_client_->GetFileSystemClient(data.container_name); + ASSERT_OK_AND_ASSIGN(auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled( + adlfs_client, options_)); + if (env->WithHierarchicalNamespace()) { + ASSERT_EQ(hns_support, internal::HNSSupport::kEnabled); + } else { + ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + } +} + +void TestAzureFileSystem::TestDetectHierarchicalNamespaceOnMissingContainer() { + auto container_name = PreexistingData::RandomContainerName(rng_); + auto adlfs_client = datalake_service_client_->GetFileSystemClient(container_name); + ASSERT_OK_AND_ASSIGN(auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled( + adlfs_client, options_)); + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + switch (env->backend()) { + case AzureBackend::kAzurite: + ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + break; + case AzureBackend::kAzure: + if (env->WithHierarchicalNamespace()) { + ASSERT_EQ(hns_support, internal::HNSSupport::kContainerNotFound); + } else { + ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + } + break; + } } void TestAzureFileSystem::TestGetFileInfoObject() { @@ -733,7 +768,12 @@ using AllEnvironments = TYPED_TEST_SUITE(AzureFileSystemTestOnAllEnvs, AllEnvironments); TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespace) { - this->TestDetectHierarchicalNamespace(); + this->TestDetectHierarchicalNamespace(true); + this->TestDetectHierarchicalNamespace(false); +} + +TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { + this->TestDetectHierarchicalNamespaceOnMissingContainer(); } TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObject) { @@ -817,12 +857,6 @@ TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { // Tests using Azurite (the local Azure emulator) -TEST_F(TestAzuriteFileSystem, DetectHierarchicalNamespaceFailsWithMissingContainer) { - auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); - ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_RAISES(IOError, hierarchical_namespace.Enabled("nonexistent-container")); -} - TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { AssertFileInfo(fs_.get(), "", FileType::Directory); From 708b9733fc0797dabec75cbaa3d4564ffd483fef Mon Sep 17 00:00:00 2001 From: Bob Plotts Date: Wed, 20 Dec 2023 12:21:33 -0500 Subject: [PATCH 079/570] GH-39288: [Java][FlightSQL] Update Apache Avatica to version 1.24.0 (#39325) Updated pom files, and updated several failing tests because UsernamePasswordCredentials() method has been removed from Avatica. 
* Closes: #39288 Authored-by: Bob Plotts Signed-off-by: David Li --- java/flight/flight-sql-jdbc-core/pom.xml | 2 +- .../driver/jdbc/ConnectionMutualTlsTest.java | 19 ++++++------------- .../jdbc/ConnectionTlsRootCertsTest.java | 7 ++----- .../arrow/driver/jdbc/ConnectionTlsTest.java | 13 ++++--------- java/flight/flight-sql-jdbc-driver/pom.xml | 2 +- 5 files changed, 14 insertions(+), 29 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml index 74a2f8d320f37..6c577954f8fc5 100644 --- a/java/flight/flight-sql-jdbc-core/pom.xml +++ b/java/flight/flight-sql-jdbc-core/pom.xml @@ -128,7 +128,7 @@ org.apache.calcite.avatica avatica - 1.18.0 + 1.24.0 diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionMutualTlsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionMutualTlsTest.java index 927b3e426c6ba..cc44cc57be9b3 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionMutualTlsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionMutualTlsTest.java @@ -36,7 +36,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.util.AutoCloseables; -import org.apache.calcite.avatica.org.apache.http.auth.UsernamePasswordCredentials; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -112,15 +111,13 @@ public void tearDown() throws Exception { */ @Test public void testGetEncryptedClientAuthenticated() throws Exception { - final UsernamePasswordCredentials credentials = new UsernamePasswordCredentials( - userTest, passTest); try (ArrowFlightSqlClientHandler client = new ArrowFlightSqlClientHandler.Builder() .withHost(FLIGHT_SERVER_TEST_RULE.getHost()) .withPort(FLIGHT_SERVER_TEST_RULE.getPort()) - .withUsername(credentials.getUserName()) - .withPassword(credentials.getPassword()) + .withUsername(userTest) + .withPassword(passTest) .withTlsRootCertificates(tlsRootCertsPath) .withClientCertificate(clientMTlsCertPath) .withClientKey(clientMTlsKeyPath) @@ -137,15 +134,13 @@ public void testGetEncryptedClientAuthenticated() throws Exception { */ @Test public void testGetEncryptedClientWithBadMTlsCertPath() { - final UsernamePasswordCredentials credentials = new UsernamePasswordCredentials( - userTest, passTest); assertThrows(SQLException.class, () -> { try (ArrowFlightSqlClientHandler handler = new ArrowFlightSqlClientHandler.Builder() .withHost(FLIGHT_SERVER_TEST_RULE.getHost()) .withPort(FLIGHT_SERVER_TEST_RULE.getPort()) - .withUsername(credentials.getUserName()) - .withPassword(credentials.getPassword()) + .withUsername(userTest) + .withPassword(passTest) .withTlsRootCertificates(tlsRootCertsPath) .withClientCertificate(badClientMTlsCertPath) .withClientKey(clientMTlsKeyPath) @@ -163,15 +158,13 @@ public void testGetEncryptedClientWithBadMTlsCertPath() { */ @Test public void testGetEncryptedClientWithBadMTlsKeyPath() { - final UsernamePasswordCredentials credentials = new UsernamePasswordCredentials( - userTest, passTest); assertThrows(SQLException.class, () -> { try (ArrowFlightSqlClientHandler handler = new ArrowFlightSqlClientHandler.Builder() .withHost(FLIGHT_SERVER_TEST_RULE.getHost()) .withPort(FLIGHT_SERVER_TEST_RULE.getPort()) - .withUsername(credentials.getUserName()) - .withPassword(credentials.getPassword()) + .withUsername(userTest) + .withPassword(passTest) 
.withTlsRootCertificates(tlsRootCertsPath) .withClientCertificate(clientMTlsCertPath) .withClientKey(badClientMTlsKeyPath) diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsRootCertsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsRootCertsTest.java index 5579cf0cf5f54..e5ffc2bcf79c8 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsRootCertsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsRootCertsTest.java @@ -35,7 +35,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.util.AutoCloseables; -import org.apache.calcite.avatica.org.apache.http.auth.UsernamePasswordCredentials; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -95,15 +94,13 @@ public void tearDown() throws Exception { */ @Test public void testGetEncryptedClientAuthenticated() throws Exception { - final UsernamePasswordCredentials credentials = new UsernamePasswordCredentials( - userTest, passTest); try (ArrowFlightSqlClientHandler client = new ArrowFlightSqlClientHandler.Builder() .withHost(FLIGHT_SERVER_TEST_RULE.getHost()) .withPort(FLIGHT_SERVER_TEST_RULE.getPort()) - .withUsername(credentials.getUserName()) - .withPassword(credentials.getPassword()) + .withUsername(userTest) + .withPassword(passTest) .withTlsRootCertificates(tlsRootCertsPath) .withBufferAllocator(allocator) .withEncryption(true) diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsTest.java index 7e160f3f0c385..f5a7b68e06cd8 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ConnectionTlsTest.java @@ -36,7 +36,6 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.util.AutoCloseables; import org.apache.arrow.util.Preconditions; -import org.apache.calcite.avatica.org.apache.http.auth.UsernamePasswordCredentials; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -96,15 +95,13 @@ public void tearDown() throws Exception { */ @Test public void testGetEncryptedClientAuthenticatedWithDisableCertVerification() throws Exception { - final UsernamePasswordCredentials credentials = new UsernamePasswordCredentials( - userTest, passTest); try (ArrowFlightSqlClientHandler client = new ArrowFlightSqlClientHandler.Builder() .withHost(FLIGHT_SERVER_TEST_RULE.getHost()) .withPort(FLIGHT_SERVER_TEST_RULE.getPort()) - .withUsername(credentials.getUserName()) - .withPassword(credentials.getPassword()) + .withUsername(userTest) + .withPassword(passTest) .withDisableCertificateVerification(true) .withBufferAllocator(allocator) .withEncryption(true) @@ -120,16 +117,14 @@ public void testGetEncryptedClientAuthenticatedWithDisableCertVerification() thr */ @Test public void testGetEncryptedClientAuthenticated() throws Exception { - final UsernamePasswordCredentials credentials = new UsernamePasswordCredentials( - userTest, passTest); try (ArrowFlightSqlClientHandler client = new ArrowFlightSqlClientHandler.Builder() .withHost(FLIGHT_SERVER_TEST_RULE.getHost()) .withPort(FLIGHT_SERVER_TEST_RULE.getPort()) .withSystemTrustStore(false) - 
.withUsername(credentials.getUserName()) - .withPassword(credentials.getPassword()) + .withUsername(userTest) + .withPassword(passTest) .withTrustStorePath(trustStorePath) .withTrustStorePassword(trustStorePass) .withBufferAllocator(allocator) diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index d4ef1b4ea3b9b..a8e55ea4089d5 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -114,7 +114,7 @@ org.apache.calcite.avatica avatica - 1.18.0 + 1.24.0 runtime From 91b2243e2753bb1a4ccd645dd41d74b1d0b077c0 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 15:28:29 -0300 Subject: [PATCH 080/570] GH-39322: [C++] Forward arguments to ExceptionToStatus all the way to Status::FromArgs (#39323) ### Rationale for this change This simplifies the creation of long error messages and leads to the use of a string builder to construct the error message. ### What changes are included in this PR? - std::forward in ExceptionToStatus - A few nitpicky changes - Simplification of the error message text - Moving the signature of `CheckIfHierarchicalNamespaceIsEnabled` to `azurefs_internal.h` to reduce the size of `azurefs.h` -- implementation remains in `azurefs.cc` ### Are these changes tested? Yes. By existing tests. * Closes: #39322 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 195 +++++++++----------- cpp/src/arrow/filesystem/azurefs.h | 48 ----- cpp/src/arrow/filesystem/azurefs_internal.h | 78 ++++++++ cpp/src/arrow/filesystem/azurefs_test.cc | 13 +- 4 files changed, 177 insertions(+), 157 deletions(-) create mode 100644 cpp/src/arrow/filesystem/azurefs_internal.h diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 032cd034e7abb..a9795e40a6ce8 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -41,7 +42,7 @@ namespace DataLake = Azure::Storage::Files::DataLake; namespace Http = Azure::Core::Http; namespace Storage = Azure::Storage; -using internal::HNSSupport; +using HNSSupport = internal::HierarchicalNamespaceSupport; // ----------------------------------------------------------------------- // AzureOptions Implementation @@ -217,9 +218,11 @@ struct AzureLocation { } }; -Status ExceptionToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); +template +Status ExceptionToStatus(const Storage::StorageException& exception, + PrefixArgs&&... 
prefix_args) { + return Status::IOError(std::forward(prefix_args)..., + " Azure Error: ", exception.what()); } Status PathNotFound(const AzureLocation& location) { @@ -418,6 +421,15 @@ std::shared_ptr PropertiesToMetadata( return metadata; } +Storage::Metadata ArrowMetadataToAzureMetadata( + const std::shared_ptr& arrow_metadata) { + Storage::Metadata azure_metadata; + for (auto key_value : arrow_metadata->sorted_pairs()) { + azure_metadata[key_value.first] = key_value.second; + } + return azure_metadata; +} + class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile(std::shared_ptr blob_client, @@ -443,9 +455,8 @@ class ObjectInputFile final : public io::RandomAccessFile { return PathNotFound(location_); } return ExceptionToStatus( - "GetProperties failed for '" + blob_client_->GetUrl() + - "'. Cannot initialise an ObjectInputFile without knowing the file size.", - exception); + exception, "GetProperties failed for '", blob_client_->GetUrl(), + "'. Cannot initialise an ObjectInputFile without knowing the file size."); } } @@ -523,10 +534,9 @@ class ObjectInputFile final : public io::RandomAccessFile { .Value.ContentRange.Length.Value(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "DownloadTo from '" + blob_client_->GetUrl() + "' at position " + - std::to_string(position) + " for " + std::to_string(nbytes) + - " bytes failed. ReadAt failed to read the required byte range.", - exception); + exception, "DownloadTo from '", blob_client_->GetUrl(), "' at position ", + position, " for ", nbytes, + " bytes failed. ReadAt failed to read the required byte range."); } } @@ -571,15 +581,14 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; }; -Status CreateEmptyBlockBlob(std::shared_ptr block_blob_client) { +Status CreateEmptyBlockBlob(const Blobs::BlockBlobClient& block_blob_client) { try { - block_blob_client->UploadFrom(nullptr, 0); + block_blob_client.UploadFrom(nullptr, 0); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "UploadFrom failed for '" + block_blob_client->GetUrl() + - "'. There is no existing blob at this location or the existing blob must be " - "replaced so ObjectAppendStream must create a new empty block blob.", - exception); + exception, "UploadFrom failed for '", block_blob_client.GetUrl(), + "'. There is no existing blob at this location or the existing blob must be " + "replaced so ObjectAppendStream must create a new empty block blob."); } return Status::OK(); } @@ -590,19 +599,9 @@ Result GetBlockList( return block_blob_client->GetBlockList().Value; } catch (Storage::StorageException& exception) { return ExceptionToStatus( - "GetBlockList failed for '" + block_blob_client->GetUrl() + - "'. Cannot write to a file without first fetching the existing block list.", - exception); - } -} - -Storage::Metadata ArrowMetadataToAzureMetadata( - const std::shared_ptr& arrow_metadata) { - Storage::Metadata azure_metadata; - for (auto key_value : arrow_metadata->sorted_pairs()) { - azure_metadata[key_value.first] = key_value.second; + exception, "GetBlockList failed for '", block_blob_client->GetUrl(), + "'. 
Cannot write to a file without first fetching the existing block list."); } - return azure_metadata; } Status CommitBlockList(std::shared_ptr block_blob_client, @@ -618,9 +617,8 @@ Status CommitBlockList(std::shared_ptr block_bl block_blob_client->CommitBlockList(block_ids, options); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "CommitBlockList failed for '" + block_blob_client->GetUrl() + - "'. Committing is required to flush an output/append stream.", - exception); + exception, "CommitBlockList failed for '", block_blob_client->GetUrl(), + "'. Committing is required to flush an output/append stream."); } return Status::OK(); } @@ -659,13 +657,12 @@ class ObjectAppendStream final : public io::OutputStream { pos_ = content_length_; } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client_)); + RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client_)); } else { return ExceptionToStatus( - "GetProperties failed for '" + block_blob_client_->GetUrl() + - "'. Cannot initialise an ObjectAppendStream without knowing whether a " - "file already exists at this path, and if it exists, its size.", - exception); + exception, "GetProperties failed for '", block_blob_client_->GetUrl(), + "'. Cannot initialise an ObjectAppendStream without knowing whether a " + "file already exists at this path, and if it exists, its size."); } content_length_ = 0; } @@ -760,10 +757,9 @@ class ObjectAppendStream final : public io::OutputStream { block_blob_client_->StageBlock(new_block_id, block_content); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "StageBlock failed for '" + block_blob_client_->GetUrl() + "' new_block_id: '" + - new_block_id + - "'. Staging new blocks is fundamental to streaming writes to blob storage.", - exception); + exception, "StageBlock failed for '", block_blob_client_->GetUrl(), + "' new_block_id: '", new_block_id, + "'. Staging new blocks is fundamental to streaming writes to blob storage."); } block_ids_.push_back(new_block_id); pos_ += nbytes; @@ -835,9 +831,9 @@ Result CheckIfHierarchicalNamespaceIsEnabled( if (exception.ErrorCode == "HierarchicalNamespaceNotEnabled") { return HNSSupport::kDisabled; } - return ExceptionToStatus("Check for Hierarchical Namespace support on '" + - adlfs_client.GetUrl() + "' failed.", - exception); + return ExceptionToStatus(exception, + "Check for Hierarchical Namespace support on '", + adlfs_client.GetUrl(), "' failed."); } } } @@ -855,6 +851,8 @@ namespace { // "filesystem". Creating a container using the Blob Storage API will make it // accessible using the Data Lake Storage Gen 2 API and vice versa. +const char kDelimiter[] = {internal::kSep, '\0'}; + template Result GetContainerPropsAsFileInfo(const std::string& container_name, ContainerClient& container_client) { @@ -869,8 +867,8 @@ Result GetContainerPropsAsFileInfo(const std::string& container_name, info.set_type(FileType::NotFound); return info; } - return ExceptionToStatus( - "GetProperties for '" + container_client.GetUrl() + "' failed.", exception); + return ExceptionToStatus(exception, "GetProperties for '", container_client.GetUrl(), + "' failed."); } } @@ -1011,16 +1009,14 @@ class AzureFileSystem::Impl { return info; } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "ListBlobs failed for prefix='" + *list_blob_options.Prefix + - "' failed. 
GetFileInfo is unable to determine whether the path should " - "be considered an implied directory.", - exception); + exception, "ListBlobs failed for prefix='", *list_blob_options.Prefix, + "' failed. GetFileInfo is unable to determine whether the path should " + "be considered an implied directory."); } } return ExceptionToStatus( - "GetProperties failed for '" + file_client.GetUrl() + - "' GetFileInfo is unable to determine whether the path exists.", - exception); + exception, "GetProperties failed for '", file_client.GetUrl(), + "' GetFileInfo is unable to determine whether the path exists."); } } @@ -1038,7 +1034,7 @@ class AzureFileSystem::Impl { } } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to list account containers.", exception); + return ExceptionToStatus(exception, "Failed to list account containers."); } return Status::OK(); } @@ -1153,9 +1149,9 @@ class AzureFileSystem::Impl { if (IsContainerNotFound(exception)) { found = false; } else { - return ExceptionToStatus("Failed to list blobs in a directory: " + - select.base_dir + ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to list blobs in a directory: ", select.base_dir, + ": ", container_client.GetUrl()); } } @@ -1241,7 +1237,7 @@ class AzureFileSystem::Impl { Status CreateDir(const AzureLocation& location) { if (location.container.empty()) { - return Status::Invalid("Cannot create an empty container"); + return Status::Invalid("CreateDir requires a non-empty path."); } auto container_client = @@ -1249,17 +1245,13 @@ class AzureFileSystem::Impl { if (location.path.empty()) { try { auto response = container_client.Create(); - if (response.Value.Created) { - return Status::OK(); - } else { - return StatusFromErrorResponse( - container_client.GetUrl(), *response.RawResponse, - "Failed to create a container: " + location.container); - } + return response.Value.Created + ? 
Status::OK() + : Status::AlreadyExists("Directory already exists: " + location.all); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a container: " + location.container + - ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to create a container: ", location.container, + ": ", container_client.GetUrl()); } } @@ -1291,15 +1283,14 @@ class AzureFileSystem::Impl { "Failed to create a directory: " + location.path); } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a directory: " + location.path + ": " + - directory_client.GetUrl(), - exception); + return ExceptionToStatus(exception, "Failed to create a directory: ", location.path, + ": ", directory_client.GetUrl()); } } Status CreateDirRecursive(const AzureLocation& location) { if (location.container.empty()) { - return Status::Invalid("Cannot create an empty container"); + return Status::Invalid("CreateDir requires a non-empty path."); } auto container_client = @@ -1307,9 +1298,9 @@ class AzureFileSystem::Impl { try { container_client.CreateIfNotExists(); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a container: " + location.container + - " (" + container_client.GetUrl() + ")", - exception); + return ExceptionToStatus(exception, + "Failed to create a container: ", location.container, " (", + container_client.GetUrl(), ")"); } auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); @@ -1328,9 +1319,9 @@ class AzureFileSystem::Impl { try { directory_client.CreateIfNotExists(); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to create a directory: " + location.path + " (" + - directory_client.GetUrl() + ")", - exception); + return ExceptionToStatus(exception, + "Failed to create a directory: ", location.path, " (", + directory_client.GetUrl(), ")"); } } @@ -1349,7 +1340,7 @@ class AzureFileSystem::Impl { std::shared_ptr stream; if (truncate) { - RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client)); + RETURN_NOT_OK(CreateEmptyBlockBlob(*block_blob_client)); stream = std::make_shared(block_blob_client, fs->io_context(), location, metadata, options_, 0); } else { @@ -1393,9 +1384,8 @@ class AzureFileSystem::Impl { try { container_client.SubmitBatch(batch); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to delete blobs in a directory: " + - location.path + ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", + location.path, ": ", container_client.GetUrl()); } std::vector failed_blob_names; for (size_t i = 0; i < deferred_responses.size(); ++i) { @@ -1424,9 +1414,9 @@ class AzureFileSystem::Impl { } } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to list blobs in a directory: " + location.path + - ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to list blobs in a directory: ", location.path, + ": ", container_client.GetUrl()); } return Status::OK(); } @@ -1434,7 +1424,7 @@ class AzureFileSystem::Impl { public: Status DeleteDir(const AzureLocation& location) { if (location.container.empty()) { - return Status::Invalid("Cannot delete an empty container"); + return Status::Invalid("DeleteDir requires a non-empty path."); } auto adlfs_client = 
datalake_service_client_->GetFileSystemClient(location.container); @@ -1456,9 +1446,9 @@ class AzureFileSystem::Impl { "Failed to delete a container: " + location.container); } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to delete a container: " + location.container + - ": " + container_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to delete a container: ", location.container, + ": ", container_client.GetUrl()); } } @@ -1474,9 +1464,9 @@ class AzureFileSystem::Impl { "Failed to delete a directory: " + location.path); } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus("Failed to delete a directory: " + location.path + ": " + - directory_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to delete a directory: ", location.path, ": ", + directory_client.GetUrl()); } } else { return DeleteDirContentsWithoutHierarchicalNamespace(location, @@ -1507,9 +1497,8 @@ class AzureFileSystem::Impl { sub_directory_client.DeleteRecursive(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "Failed to delete a sub directory: " + location.container + - internal::kSep + path.Name + ": " + sub_directory_client.GetUrl(), - exception); + exception, "Failed to delete a sub directory: ", location.container, + kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); } } else { auto sub_file_client = adlfs_client.GetFileClient(path.Name); @@ -1517,9 +1506,8 @@ class AzureFileSystem::Impl { sub_file_client.Delete(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( - "Failed to delete a sub file: " + location.container + - internal::kSep + path.Name + ": " + sub_file_client.GetUrl(), - exception); + exception, "Failed to delete a sub file: ", location.container, + kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); } } } @@ -1528,9 +1516,9 @@ class AzureFileSystem::Impl { if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { return Status::OK(); } else { - return ExceptionToStatus("Failed to delete directory contents: " + - location.path + ": " + directory_client.GetUrl(), - exception); + return ExceptionToStatus(exception, + "Failed to delete directory contents: ", location.path, + ": ", directory_client.GetUrl()); } } return Status::OK(); @@ -1553,9 +1541,8 @@ class AzureFileSystem::Impl { try { dest_blob_client.CopyFromUri(src_url); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - "Failed to copy a blob. (" + src_url + " -> " + dest_blob_client.GetUrl() + ")", - exception); + return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ", + dest_blob_client.GetUrl(), ")"); } return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index b7ef2bb3130c2..0c41c42928121 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -118,54 +118,6 @@ struct ARROW_EXPORT AzureOptions { MakeDataLakeServiceClient() const; }; -namespace internal { - -enum class HNSSupport { - kUnknown = 0, - kContainerNotFound = 1, - kDisabled = 2, - kEnabled = 3, -}; - -/// \brief Performs a request to check if the storage account has Hierarchical -/// Namespace support enabled. -/// -/// This check requires a DataLakeFileSystemClient for any container of the -/// storage account. 
If the container doesn't exist yet, we just forward that -/// error to the caller (kContainerNotFound) since that's a proper error to the operation -/// on that container anyways -- no need to try again with or without the knowledge of -/// Hierarchical Namespace support. -/// -/// Hierarchical Namespace support can't easily be changed after the storage account is -/// created and the feature is shared by all containers in the storage account. -/// This means the result of this check can (and should!) be cached as soon as -/// it returns a successful result on any container of the storage account (see -/// AzureFileSystem::Impl). -/// -/// The check consists of a call to DataLakeFileSystemClient::GetAccessControlList() -/// on the root directory of the container. An approach taken by the Hadoop Azure -/// project [1]. A more obvious approach would be to call -/// BlobServiceClient::GetAccountInfo(), but that endpoint requires elevated -/// permissions [2] that we can't generally rely on. -/// -/// [1]: -/// https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. -/// [2]: -/// https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization -/// -/// IMPORTANT: If the result is kEnabled or kDisabled, it doesn't necessarily mean that -/// the container exists. -/// -/// \param adlfs_client A DataLakeFileSystemClient for a container of the storage -/// account. -/// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never -/// returned). -Result CheckIfHierarchicalNamespaceIsEnabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, - const AzureOptions& options); - -} // namespace internal - /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and /// Azure Data Lake Storage Gen2 (ADLS Gen2) [2]. /// diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h new file mode 100644 index 0000000000000..13d84c9b542b4 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/result.h" + +namespace Azure::Storage::Files::DataLake { +class DataLakeFileSystemClient; +class DataLakeServiceClient; +} // namespace Azure::Storage::Files::DataLake + +namespace arrow::fs { + +struct AzureOptions; + +namespace internal { + +enum class HierarchicalNamespaceSupport { + kUnknown = 0, + kContainerNotFound = 1, + kDisabled = 2, + kEnabled = 3, +}; + +/// \brief Performs a request to check if the storage account has Hierarchical +/// Namespace support enabled. 
+/// +/// This check requires a DataLakeFileSystemClient for any container of the +/// storage account. If the container doesn't exist yet, we just forward that +/// error to the caller (kContainerNotFound) since that's a proper error to the operation +/// on that container anyways -- no need to try again with or without the knowledge of +/// Hierarchical Namespace support. +/// +/// Hierarchical Namespace support can't easily be changed after the storage account is +/// created and the feature is shared by all containers in the storage account. +/// This means the result of this check can (and should!) be cached as soon as +/// it returns a successful result on any container of the storage account (see +/// AzureFileSystem::Impl). +/// +/// The check consists of a call to DataLakeFileSystemClient::GetAccessControlList() +/// on the root directory of the container. An approach taken by the Hadoop Azure +/// project [1]. A more obvious approach would be to call +/// BlobServiceClient::GetAccountInfo(), but that endpoint requires elevated +/// permissions [2] that we can't generally rely on. +/// +/// [1]: +/// https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. +/// [2]: +/// https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization +/// +/// IMPORTANT: If the result is kEnabled or kDisabled, it doesn't necessarily mean that +/// the container exists. +/// +/// \param adlfs_client A DataLakeFileSystemClient for a container of the storage +/// account. +/// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never +/// returned). +Result CheckIfHierarchicalNamespaceIsEnabled( + Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, + const arrow::fs::AzureOptions& options); + +} // namespace internal +} // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index db0e133e0d453..53e71f3658dd9 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,6 +34,7 @@ #include #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -72,6 +73,8 @@ namespace Blobs = Azure::Storage::Blobs; namespace Core = Azure::Core; namespace DataLake = Azure::Storage::Files::DataLake; +using HNSSupport = internal::HierarchicalNamespaceSupport; + enum class AzureBackend { /// \brief Official Azure Remote Backend kAzure, @@ -629,9 +632,9 @@ void TestAzureFileSystem::TestDetectHierarchicalNamespace(bool trip_up_azurite) ASSERT_OK_AND_ASSIGN(auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled( adlfs_client, options_)); if (env->WithHierarchicalNamespace()) { - ASSERT_EQ(hns_support, internal::HNSSupport::kEnabled); + ASSERT_EQ(hns_support, HNSSupport::kEnabled); } else { - ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + ASSERT_EQ(hns_support, HNSSupport::kDisabled); } } @@ -643,13 +646,13 @@ void TestAzureFileSystem::TestDetectHierarchicalNamespaceOnMissingContainer() { EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); switch (env->backend()) { case AzureBackend::kAzurite: - ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + ASSERT_EQ(hns_support, HNSSupport::kDisabled); break; case AzureBackend::kAzure: if (env->WithHierarchicalNamespace()) { - ASSERT_EQ(hns_support, internal::HNSSupport::kContainerNotFound); + 
ASSERT_EQ(hns_support, HNSSupport::kContainerNotFound); } else { - ASSERT_EQ(hns_support, internal::HNSSupport::kDisabled); + ASSERT_EQ(hns_support, HNSSupport::kDisabled); } break; } } From 37616a8da57e4d98c82e8213ba1999cff4354334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Fern=C3=A1ndez=20Giraldo?= Date: Wed, 20 Dec 2023 13:14:11 -0700 Subject: [PATCH 081/570] GH-39328: [Java] Make default getConsumer public (#39329) ### Rationale for this change This can be useful for people implementing their own getConsumer. ### What changes are included in this PR? Make the default getConsumer public. ### Are these changes tested? N/A ### Are there any user-facing changes? Users can now call JdbcToArrowUtils.getConsumer. * Closes: #39328 Authored-by: Diego Fernandez Signed-off-by: David Li --- .../apache/arrow/adapter/jdbc/JdbcToArrowUtils.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java index f8a13b93b1ed8..b66e133785f42 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java @@ -430,7 +430,18 @@ public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, JdbcT } } - static JdbcConsumer getConsumer(ArrowType arrowType, int columnIndex, boolean nullable, + /** + * Default function used for JdbcConsumerFactory. This function gets a JdbcConsumer for the + * given column based on the Arrow type and provided vector. + * + * @param arrowType Arrow type for the column. + * @param columnIndex Column index to fetch from the ResultSet + * @param nullable Whether the value is nullable or not + * @param vector Vector to store the consumed value + * @param config Associated JdbcToArrowConfig, used mainly for the Calendar. + * @return {@link JdbcConsumer} + */ + public static JdbcConsumer getConsumer(ArrowType arrowType, int columnIndex, boolean nullable, FieldVector vector, JdbcToArrowConfig config) { final Calendar calendar = config.getCalendar(); From 3c66491846a24f17014b31a22fafdda0229f881a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 21 Dec 2023 00:29:47 +0000 Subject: [PATCH 082/570] GH-39318: [C++][FS][Azure] Add workload identity auth configuration (#39319) ### Rationale for this change Workload identity is a useful Azure authentication method. ### What changes are included in this PR? Implement `AzureOptions::ConfigureWorkloadIdentityCredential` ### Are these changes tested? Added a simple test initialising a filesystem using `ConfigureWorkloadIdentityCredential`. This is not the most comprehensive test but it's the same as what we agreed on for https://github.com/apache/arrow/pull/39263. ### Are there any user-facing changes? Workload identity authentication is now supported. 
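For illustration, a minimal C++ sketch of how the new option might be wired up (not part of this patch): the account, container and blob names are placeholders, and it is assumed that the Azure SDK resolves the workload identity from the usual environment variables (`AZURE_TENANT_ID`, `AZURE_CLIENT_ID`, `AZURE_FEDERATED_TOKEN_FILE`).

```cpp
// Sketch only: exercises AzureOptions::ConfigureWorkloadIdentityCredential()
// and AzureFileSystem::Make() as added/used in this patch.
#include <iostream>

#include "arrow/filesystem/azurefs.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status UseWorkloadIdentity() {
  arrow::fs::AzureOptions options;
  // Placeholder account name; the credential itself is picked up from the
  // workload identity environment by the Azure SDK.
  ARROW_RETURN_NOT_OK(
      options.ConfigureWorkloadIdentityCredential("my-storage-account"));
  ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::AzureFileSystem::Make(options));
  ARROW_ASSIGN_OR_RAISE(auto info, fs->GetFileInfo("my-container/some/blob.txt"));
  std::cout << info.path() << " is "
            << (info.type() == arrow::fs::FileType::File ? "a file" : "not a file")
            << std::endl;
  return arrow::Status::OK();
}
```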
* Closes: #39318 Authored-by: Thomas Newton Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 7 +++++++ cpp/src/arrow/filesystem/azurefs.h | 2 ++ cpp/src/arrow/filesystem/azurefs_test.cc | 8 +++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a9795e40a6ce8..d72ead92ed111 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -119,6 +119,13 @@ Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) return Status::OK(); } +Status AzureOptions::ConfigureWorkloadIdentityCredential( + const std::string& account_name) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = std::make_shared(); + return Status::OK(); +} + Result> AzureOptions::MakeBlobServiceClient() const { switch (credential_kind_) { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 0c41c42928121..be3ca5ba238ae 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -103,6 +103,8 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureDefaultCredential(const std::string& account_name); + Status ConfigureWorkloadIdentityCredential(const std::string& account_name); + Status ConfigureAccountKeyCredential(const std::string& account_name, const std::string& account_key); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 53e71f3658dd9..ecf7522b98eef 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -275,7 +275,13 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); - EXPECT_OK_AND_ASSIGN(auto default_credential_fs, AzureFileSystem::Make(options)); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + +TEST(AzureFileSystem, InitializeFilesystemWithWorkloadIdentityCredential) { + AzureOptions options; + ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential("dummy-account-name")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } TEST(AzureFileSystem, OptionsCompare) { From 5df541de94b4cf76a8b9a1cad6155ae781ea55dc Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 21 Dec 2023 18:52:36 +0900 Subject: [PATCH 083/570] GH-39333: [C++] Don't use "if constexpr" in lambda (#39334) ### Rationale for this change It seems that it's not portable. 
At least it doesn't work as expected with Visual Studio 2017: C:/arrow/cpp/src/arrow/array/array_nested.cc(291): error C2065: 'validity': undeclared identifier (compiling source file C:\arrow-build\src\arrow\CMakeFiles\arrow_shared.dir\Unity\unity_0_cxx.cxx) [C:\arrow-build\src\arrow\arrow_shared.vcxproj] C:/arrow/cpp/src/arrow/array/array_nested.cc(660): note: see reference to function template instantiation 'arrow::Result> arrow::`anonymous-namespace'::FlattenListViewArray(const ListViewArrayT &,arrow::MemoryPool *)' being compiled with [ ListViewArrayT=arrow::ListViewArray ] (compiling source file C:\arrow-build\src\arrow\CMakeFiles\arrow_shared.dir\Unity\unity_0_cxx.cxx) memory_pool.cc C:/arrow/cpp/src/arrow/array/array_nested.cc(291): error C2065: 'list_view_array_offset': undeclared identifier (compiling source file C:\arrow-build\src\arrow\CMakeFiles\arrow_shared.dir\Unity\unity_0_cxx.cxx) [C:\arrow-build\src\arrow\arrow_shared.vcxproj] ### What changes are included in this PR? Avoid "if constexpr" in lambda. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #39333 Lead-authored-by: Antoine Pitrou Co-authored-by: Sutou Kouhei Signed-off-by: Antoine Pitrou --- cpp/src/arrow/array/array_nested.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 03f3e5af29908..acdd0a0742468 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -287,10 +287,8 @@ Result> FlattenListViewArray(const ListViewArrayT& list_v const auto* sizes = list_view_array.data()->template GetValues(2); auto is_null_or_empty = [&](int64_t i) { - if constexpr (HasNulls) { - if (!bit_util::GetBit(validity, list_view_array_offset + i)) { - return true; - } + if (HasNulls && !bit_util::GetBit(validity, list_view_array_offset + i)) { + return true; } return sizes[i] == 0; }; From 2308cdfeaedf1af062c25d6edb2eeb1606fb105e Mon Sep 17 00:00:00 2001 From: "Rossi(Ruoxi) Sun" Date: Thu, 21 Dec 2023 01:55:47 -0800 Subject: [PATCH 084/570] GH-15192: [C++] Bring back `case_when` tests for union types (#39308) ### Rationale for this change Bring back the problematic test case of random `case_when` on union(bool, string) type. This case used to fail. However #36018 already addressed the issue. More information about how it used to fail, please refer to https://github.com/apache/arrow/issues/15192#issuecomment-1862252174. ### What changes are included in this PR? Bring back the test code. ### Are these changes tested? Yes, the change is the test. ### Are there any user-facing changes? No. 
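To make the calling convention concrete, here is a hypothetical sketch of invoking `case_when` through the generic compute entry point. It deliberately uses plain string value columns instead of the union(bool, string) type covered by the re-enabled test, and it leans on the `arrow::ArrayFromJSON` helper from the testing library (link against `arrow_testing`) purely for brevity.

```cpp
// Sketch only: the first argument is a struct of boolean condition columns,
// followed by one value column per condition (and an optional "else" column).
#include <iostream>

#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"  // arrow::ArrayFromJSON

arrow::Status RunCaseWhen() {
  auto conditions = arrow::ArrayFromJSON(
      arrow::struct_({arrow::field("a", arrow::boolean()),
                      arrow::field("b", arrow::boolean())}),
      R"([{"a": true, "b": false}, {"a": false, "b": true}, {"a": false, "b": false}])");
  auto if_a = arrow::ArrayFromJSON(arrow::utf8(), R"(["a0", "a1", "a2"])");
  auto if_b = arrow::ArrayFromJSON(arrow::utf8(), R"(["b0", "b1", "b2"])");
  // No "else" column is passed, so rows where no condition matches become null.
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum result,
      arrow::compute::CallFunction("case_when", {conditions, if_a, if_b}));
  std::cout << result.make_array()->ToString() << std::endl;  // ["a0", "b1", null]
  return arrow::Status::OK();
}
```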
* Closes: #15192 Authored-by: zanmato Signed-off-by: Antoine Pitrou --- .../compute/kernels/scalar_if_else_test.cc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index a11aab81742ed..771261cac9140 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -2485,16 +2485,14 @@ TEST(TestCaseWhen, UnionBoolString) { } } -// FIXME(GH-15192): enabling this test produces test failures - -// TEST(TestCaseWhen, UnionBoolStringRandom) { -// for (const auto& type : std::vector>{ -// sparse_union({field("a", boolean()), field("b", utf8())}, {2, 7}), -// dense_union({field("a", boolean()), field("b", utf8())}, {2, 7})}) { -// ARROW_SCOPED_TRACE(type->ToString()); -// TestCaseWhenRandom(type); -// } -// } +TEST(TestCaseWhen, UnionBoolStringRandom) { + for (const auto& type : std::vector>{ + sparse_union({field("a", boolean()), field("b", utf8())}, {2, 7}), + dense_union({field("a", boolean()), field("b", utf8())}, {2, 7})}) { + ARROW_SCOPED_TRACE(type->ToString()); + TestCaseWhenRandom(type); + } +} TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, From 596259ee47b5c675b71432743d9bfd196efe08e3 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Thu, 21 Dec 2023 19:02:38 +0530 Subject: [PATCH 085/570] GH-38725: [Java] decompression in Lz4CompressionCodec.java does not set writer index (#38840) ### Rationale for this change Unlike its counterpart in `ZstdCompressionCodec`, the `doDecompress` function in `Lz4CompressionCodec` does not set the writer index on the decompressed buffer. This PR fixes that issue. ### What changes are included in this PR? Writes the index for the decompressed ArrowBuf. ### Are these changes tested? No ### Are there any user-facing changes? 
No * Closes: #38725 Lead-authored-by: Vibhatha Lakmal Abeykoon Co-authored-by: vibhatha Signed-off-by: David Li --- .../org/apache/arrow/compression/Lz4CompressionCodec.java | 1 + .../apache/arrow/compression/TestCompressionCodec.java | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java index daa35b7e15be6..e8b780638e2c1 100644 --- a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java +++ b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java @@ -79,6 +79,7 @@ protected ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBu byte[] outBytes = out.toByteArray(); ArrowBuf decompressedBuffer = allocator.buffer(outBytes.length); decompressedBuffer.setBytes(/*index=*/0, outBytes); + decompressedBuffer.writerIndex(decompressedLength); return decompressedBuffer; } diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java index 403130edba52e..01156fa2b0e0b 100644 --- a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java @@ -117,6 +117,12 @@ private List deCompressBuffers(CompressionCodec codec, List return outputBuffers; } + private void assertWriterIndex(List decompressedBuffers) { + for (ArrowBuf decompressedBuf : decompressedBuffers) { + assertTrue(decompressedBuf.writerIndex() > 0); + } + } + @ParameterizedTest @MethodSource("codecs") void testCompressFixedWidthBuffers(int vectorLength, CompressionCodec codec) throws Exception { @@ -139,6 +145,7 @@ void testCompressFixedWidthBuffers(int vectorLength, CompressionCodec codec) thr List decompressedBuffers = deCompressBuffers(codec, compressedBuffers); assertEquals(2, decompressedBuffers.size()); + assertWriterIndex(decompressedBuffers); // orchestrate new vector IntVector newVec = new IntVector("new vec", allocator); @@ -180,6 +187,7 @@ void testCompressVariableWidthBuffers(int vectorLength, CompressionCodec codec) List decompressedBuffers = deCompressBuffers(codec, compressedBuffers); assertEquals(3, decompressedBuffers.size()); + assertWriterIndex(decompressedBuffers); // orchestrate new vector VarCharVector newVec = new VarCharVector("new vec", allocator); From 2f9f892a0075d990a1b42dc97a97d490b6b08345 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Dec 2023 15:53:41 +0100 Subject: [PATCH 086/570] GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199) ### Rationale for this change While the Arrow PyCapsule protocol itself is defined in the specification part of the docs, this PR adds a section about it in the Python user guide as well (referring to the specification for most details), where users might typically look for Python specific docs. 
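The capsule protocol itself is Python-specific, but the payload of those capsules is the plain Arrow C Data Interface structs. As a rough C++ sketch of the underlying export/import handshake that the Python layer wraps (illustrative only, not part of this documentation change):

```cpp
// Sketch only: produce an array, hand it across the C Data Interface, and
// import it back, which is what the Python capsules carry under the hood.
#include <iostream>

#include "arrow/api.h"
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"

arrow::Status RoundTripThroughCDataInterface() {
  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> array, builder.Finish());

  // Export moves ownership into the two C structs; a Python producer would
  // expose pointers like these through "arrow_schema"/"arrow_array" capsules.
  struct ArrowArray c_array;
  struct ArrowSchema c_schema;
  ARROW_RETURN_NOT_OK(arrow::ExportArray(*array, &c_array, &c_schema));

  // A consumer (PyArrow or any other implementation) imports and takes over.
  ARROW_ASSIGN_OR_RAISE(auto imported, arrow::ImportArray(&c_array, &c_schema));
  std::cout << imported->ToString() << std::endl;
  return arrow::Status::OK();
}
```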
* Closes: #39196 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- .../CDataInterface/PyCapsuleInterface.rst | 2 ++ docs/source/python/extending_types.rst | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 0c1a01d7c6778..03095aa2e9356 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -16,6 +16,8 @@ .. under the License. +.. _arrow-pycapsule-interface: + ============================= The Arrow PyCapsule Interface ============================= diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index ee92cebcb549c..b7261005e66ee 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -21,6 +21,38 @@ Extending pyarrow ================= +Controlling conversion to (Py)Arrow with the PyCapsule Interface +---------------------------------------------------------------- + +The :ref:`Arrow C data interface ` allows moving Arrow data between +different implementations of Arrow. This is a generic, cross-language interface not +specific to Python, but for Python libraries this interface is extended with a Python +specific layer: :ref:`arrow-pycapsule-interface`. + +This Python interface ensures that different libraries that support the C Data interface +can export Arrow data structures in a standard way and recognize each other's objects. + +If you have a Python library providing data structures that hold Arrow-compatible data +under the hood, you can implement the following methods on those objects: + +- ``__arrow_c_schema__`` for schema or type-like objects. +- ``__arrow_c_array__`` for arrays and record batches (contiguous tables). +- ``__arrow_c_stream__`` for chunked tables or streams of data. + +Those methods return `PyCapsule `__ +objects, and more details on the exact semantics can be found in the +:ref:`specification `. + +When your data structures have those methods defined, the PyArrow constructors +(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +supporting this protocol, and convert them to PyArrow data structures zero-copy. And the +same can be true for any other library supporting this protocol on ingesting data. + +Similarly, if your library has functions that accept user-provided data, you can add +support for this protocol by checking for the presence of those methods, and +therefore accept any Arrow data (instead of harcoding support for a specific +Arrow producer such as PyArrow). + .. _arrow_array_protocol: Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol From 535b925bf073fb1af4e6e23ab54027f30dc8751f Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 22 Dec 2023 01:34:06 +0800 Subject: [PATCH 087/570] GH-39232: [C++] Support binary to fixed_size_binary cast (#39236) ### Rationale for this change Add binary to fixed_size_binary cast. ### What changes are included in this PR? Add binary to fixed_size_binary cast. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * Closes: #39232 Authored-by: Jin Shang Signed-off-by: Antoine Pitrou --- .../compute/kernels/scalar_cast_string.cc | 61 ++++++++++++++++--- .../arrow/compute/kernels/scalar_cast_test.cc | 16 +++++ 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index ebeb597207a81..a6576e4e4c26f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -20,11 +20,14 @@ #include "arrow/array/array_base.h" #include "arrow/array/builder_binary.h" +#include "arrow/compute/kernels/base_arithmetic_internal.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" #include "arrow/util/utf8_internal.h" @@ -284,9 +287,8 @@ Status CastBinaryToBinaryOffsets(KernelContext* ctx, } template -enable_if_base_binary BinaryToBinaryCastExec(KernelContext* ctx, - const ExecSpan& batch, - ExecResult* out) { +enable_if_t::value && !is_fixed_size_binary_type::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const CastOptions& options = checked_cast(*ctx->state()).options; const ArraySpan& input = batch[0].array; @@ -387,6 +389,33 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou return ZeroCopyCastExec(ctx, batch, out); } +template +enable_if_t::value && std::is_same::value, + Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const CastOptions& options = checked_cast(*ctx->state()).options; + FixedSizeBinaryBuilder builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); + const ArraySpan& input = batch[0].array; + RETURN_NOT_OK(builder.Reserve(input.length)); + + RETURN_NOT_OK(VisitArraySpanInline( + input, + [&](std::string_view v) { + if (v.size() != static_cast(builder.byte_width())) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + options.to_type.ToString(), ": widths must match"); + } + builder.UnsafeAppend(v); + return Status::OK(); + }, + [&] { + builder.UnsafeAppendNull(); + return Status::OK(); + })); + + return builder.FinishInternal(&std::get>(out->value)); +} + #if defined(_MSC_VER) #pragma warning(pop) #endif @@ -452,6 +481,26 @@ void AddBinaryToBinaryCast(CastFunction* func) { AddBinaryToBinaryCast(func); } +template +void AddBinaryToFixedSizeBinaryCast(CastFunction* func) { + auto resolver_fsb = [](KernelContext* ctx, const std::vector&) { + const CastOptions& options = checked_cast(*ctx->state()).options; + return options.to_type; + }; + + DCHECK_OK(func->AddKernel(InType::type_id, {InputType(InType::type_id)}, resolver_fsb, + BinaryToBinaryCastExec, + NullHandling::COMPUTED_NO_PREALLOCATE)); +} + +void AddBinaryToFixedSizeBinaryCast(CastFunction* func) { + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); +} + } // namespace std::vector> GetBinaryLikeCasts() { @@ -483,11 +532,7 @@ std::vector> GetBinaryLikeCasts() { std::make_shared("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY); 
AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions), cast_fsb.get()); - DCHECK_OK(cast_fsb->AddKernel( - Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, - OutputType(FirstType), - BinaryToBinaryCastExec, - NullHandling::COMPUTED_NO_PREALLOCATE)); + AddBinaryToFixedSizeBinaryCast(cast_fsb.get()); return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb}; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index c84125bbdd19e..b429c8175b020 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2171,6 +2171,22 @@ TEST(Cast, StringToString) { } } +TEST(Cast, BinaryOrStringToFixedSizeBinary) { + for (auto in_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(in_type, R"(["foo", null, "bar", "baz", "quu"])"); + auto invalid_input = ArrayFromJSON(in_type, R"(["foo", null, "bar", "baz", "quux"])"); + + CheckCast(valid_input, ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])")); + CheckCastFails(invalid_input, CastOptions::Safe(fixed_size_binary(3))); + CheckCastFails(valid_input, CastOptions::Safe(fixed_size_binary(5))); + + auto empty_input = ArrayFromJSON(in_type, "[]"); + CheckCast(empty_input, ArrayFromJSON(fixed_size_binary(3), "[]")); + CheckCast(empty_input, ArrayFromJSON(fixed_size_binary(5), "[]")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), From e5145bff901778360f6faba3be27efa3d9522976 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 15:00:22 -0300 Subject: [PATCH 088/570] GH-39339: [C++] Add ForceCachedHierarchicalNamespaceSupport to help with testing (#39340) ### Rationale for this change This ensures all the branches in the `AzureFileSystem` code operations are tested. For instance, many operations executed on a missing container, wouldn't get a `HNSSupport::kContainerNotFound` error if the cached `HNSSupport` was already known due to a previous operation that cached the `HNSSupport` value. ### What changes are included in this PR? Introduction of the helper that overrides `cached_hns_support_` and enumeration of the scenarios. ### Are these changes tested? Yes. This is a test improvement PR. * Closes: #39339 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 36 +- cpp/src/arrow/filesystem/azurefs.h | 5 + cpp/src/arrow/filesystem/azurefs_test.cc | 453 +++++++++++++---------- 3 files changed, 291 insertions(+), 203 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d72ead92ed111..27bdb5092a3ea 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -941,14 +941,38 @@ class AzureFileSystem::Impl { break; } ARROW_ASSIGN_OR_RAISE( - cached_hns_support_, + auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled(adlfs_client, options_)); - DCHECK_NE(cached_hns_support_, HNSSupport::kUnknown); - // Caller should handle kContainerNotFound case appropriately. 
- return cached_hns_support_; + DCHECK_NE(hns_support, HNSSupport::kUnknown); + if (hns_support == HNSSupport::kContainerNotFound) { + // Caller should handle kContainerNotFound case appropriately as it knows the + // container this refers to, but the cached value in that case should remain + // kUnknown before we get a CheckIfHierarchicalNamespaceIsEnabled result that + // is not kContainerNotFound. + cached_hns_support_ = HNSSupport::kUnknown; + } else { + cached_hns_support_ = hns_support; + } + return hns_support; } public: + /// This is used from unit tests to ensure we perform operations on all the + /// possible states of cached_hns_support_. + void ForceCachedHierarchicalNamespaceSupport(int support) { + auto hns_support = static_cast(support); + switch (hns_support) { + case HNSSupport::kUnknown: + case HNSSupport::kContainerNotFound: + case HNSSupport::kDisabled: + case HNSSupport::kEnabled: + cached_hns_support_ = hns_support; + return; + } + // This is reachable if an invalid int is cast to enum class HNSSupport. + DCHECK(false) << "Invalid enum HierarchicalNamespaceSupport value."; + } + Result GetFileInfo(const AzureLocation& location) { if (location.container.empty()) { DCHECK(location.path.empty()); @@ -1560,6 +1584,10 @@ AzureFileSystem::AzureFileSystem(std::unique_ptr&& impl) default_async_is_sync_ = false; } +void AzureFileSystem::ForceCachedHierarchicalNamespaceSupport(int hns_support) { + impl_->ForceCachedHierarchicalNamespaceSupport(hns_support); +} + Result> AzureFileSystem::Make( const AzureOptions& options, const io::IOContext& io_context) { ARROW_ASSIGN_OR_RAISE(auto impl, AzureFileSystem::Impl::Make(options, io_context)); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index be3ca5ba238ae..69f6295237043 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -44,6 +44,8 @@ class DataLakeServiceClient; namespace arrow::fs { +class TestAzureFileSystem; + /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { /// \brief hostname[:port] of the Azure Blob Storage Service. 
@@ -156,6 +158,9 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { explicit AzureFileSystem(std::unique_ptr&& impl); + friend class TestAzureFileSystem; + void ForceCachedHierarchicalNamespaceSupport(int hns_support); + public: ~AzureFileSystem() override = default; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ecf7522b98eef..3266c1bfda2dc 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -62,7 +62,6 @@ namespace arrow { using internal::TemporaryDir; namespace fs { using internal::ConcatAbstractPath; -namespace { namespace bp = boost::process; using ::testing::IsEmpty; @@ -354,7 +353,7 @@ class TestAzureFileSystem : public ::testing::Test { bool set_up_succeeded_ = false; AzureOptions options_; - std::shared_ptr fs_; + std::shared_ptr fs_dont_use_directly_; // use fs() std::unique_ptr blob_service_client_; std::unique_ptr datalake_service_client_; @@ -362,6 +361,18 @@ class TestAzureFileSystem : public ::testing::Test { TestAzureFileSystem() : rng_(std::random_device()()) {} virtual Result GetAzureEnv() const = 0; + virtual HNSSupport CachedHNSSupport(const BaseAzureEnv& env) const = 0; + + FileSystem* fs(HNSSupport cached_hns_support) const { + auto* fs_ptr = fs_dont_use_directly_.get(); + fs_ptr->ForceCachedHierarchicalNamespaceSupport(static_cast(cached_hns_support)); + return fs_ptr; + } + + FileSystem* fs() const { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + return fs(CachedHNSSupport(*env)); + } static Result MakeOptions(BaseAzureEnv* env) { AzureOptions options; @@ -395,7 +406,7 @@ class TestAzureFileSystem : public ::testing::Test { EXPECT_OK_AND_ASSIGN(options_, options_res); } - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + ASSERT_OK_AND_ASSIGN(fs_dont_use_directly_, AzureFileSystem::Make(options_)); EXPECT_OK_AND_ASSIGN(blob_service_client_, options_.MakeBlobServiceClient()); EXPECT_OK_AND_ASSIGN(datalake_service_client_, options_.MakeDataLakeServiceClient()); set_up_succeeded_ = true; @@ -435,7 +446,7 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const std::string& path, int total_size) { - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); ASSERT_OK(output->Write(all_lines)); ASSERT_OK(output->Close()); @@ -461,19 +472,19 @@ class TestAzureFileSystem : public ::testing::Test { const auto sub_directory_path = ConcatAbstractPath(directory_path, "new-sub"); const auto sub_blob_path = ConcatAbstractPath(sub_directory_path, "sub.txt"); const auto top_blob_path = ConcatAbstractPath(directory_path, "top.txt"); - ASSERT_OK(fs_->CreateDir(sub_directory_path, true)); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(sub_blob_path)); + ASSERT_OK(fs()->CreateDir(sub_directory_path, true)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(sub_blob_path)); ASSERT_OK(output->Write(std::string_view("sub"))); ASSERT_OK(output->Close()); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(top_blob_path)); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(top_blob_path)); ASSERT_OK(output->Write(std::string_view("top"))); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); - AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - 
AssertFileInfo(fs_.get(), sub_directory_path, FileType::Directory); - AssertFileInfo(fs_.get(), sub_blob_path, FileType::File); - AssertFileInfo(fs_.get(), top_blob_path, FileType::File); + AssertFileInfo(fs(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), directory_path, FileType::Directory); + AssertFileInfo(fs(), sub_directory_path, FileType::Directory); + AssertFileInfo(fs(), sub_blob_path, FileType::File); + AssertFileInfo(fs(), top_blob_path, FileType::File); paths->container = data.container_name; paths->directory = directory_path; @@ -538,52 +549,52 @@ class TestAzureFileSystem : public ::testing::Test { const auto directory_path = data.RandomDirectoryPath(rng_); if (WithHierarchicalNamespace()) { - ASSERT_OK(fs_->CreateDir(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() and DeleteDir() do nothing. - ASSERT_OK(fs_->CreateDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } } void TestCreateDirSuccessContainerAndDirectory() { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs_->CreateDir(path, false)); + ASSERT_OK(fs()->CreateDir(path, false)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + AssertFileInfo(fs(), path, FileType::NotFound); } } void TestCreateDirRecursiveSuccessContainerOnly() { auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name, true)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + ASSERT_OK(fs()->CreateDir(container_name, true)); + AssertFileInfo(fs(), container_name, FileType::Directory); } void TestCreateDirRecursiveSuccessDirectoryOnly() { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); + ASSERT_OK(fs()->CreateDir(path, true)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. 
- arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } } @@ -591,31 +602,31 @@ class TestAzureFileSystem : public ::testing::Test { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); + ASSERT_OK(fs()->CreateDir(path, true)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } } void TestDeleteDirContentsSuccessNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDirContents(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } void TestDeleteDirContentsFailureNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); + ASSERT_RAISES(IOError, fs()->DeleteDirContents(directory_path, false)); } }; @@ -672,12 +683,12 @@ void TestAzureFileSystem::TestGetFileInfoObject() { .GetProperties() .Value; - AssertFileInfo(fs_.get(), data.ObjectPath(), FileType::File, + AssertFileInfo(fs(), data.ObjectPath(), FileType::File, std::chrono::system_clock::time_point{object_properties.LastModified}, static_cast(object_properties.BlobSize)); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + std::string{data.kObjectName})); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); } void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { @@ -685,37 +696,37 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). const std::string kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(data.ContainerPath(kObjectName), - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(auto output, + fs()->OpenOutputStream(data.ContainerPath(kObjectName), + /*metadata=*/{})); const std::string_view lorem_ipsum(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. 
- ASSERT_OK_AND_ASSIGN( - output, fs_->OpenOutputStream(data.ContainerPath("test-object-dir/some_other_dir0"), - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream( + data.ContainerPath("test-object-dir/some_other_dir0"), + /*metadata=*/{})); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); ASSERT_OK_AND_ASSIGN(output, - fs_->OpenOutputStream(data.ContainerPath(kObjectName + "0"), - /*metadata=*/{})); + fs()->OpenOutputStream(data.ContainerPath(kObjectName + "0"), + /*metadata=*/{})); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName), FileType::File); - AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir"), FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir") + "/", - FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir"), + AssertFileInfo(fs(), data.ContainerPath(kObjectName), FileType::File); + AssertFileInfo(fs(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir"), FileType::Directory); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir") + "/", FileType::Directory); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_dir"), FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir") + "/", + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_dir") + "/", FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-di"), FileType::NotFound); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_di"), + AssertFileInfo(fs(), data.ContainerPath("test-object-di"), FileType::NotFound); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_di"), FileType::NotFound); if (WithHierarchicalNamespace()) { @@ -723,17 +734,45 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { .GetDirectoryClient("test-empty-object-dir") .Create(); - AssertFileInfo(fs_.get(), data.ContainerPath("test-empty-object-dir"), + AssertFileInfo(fs(), data.ContainerPath("test-empty-object-dir"), FileType::Directory); } } -template +template +struct TestingScenario { + using AzureEnvClass = AzureEnv; + static constexpr bool kHNSSupportShouldBeKnown = HNSSupportShouldBeKnown; +}; + +template class AzureFileSystemTestImpl : public TestAzureFileSystem { public: + using AzureEnvClass = typename TestingScenario::AzureEnvClass; + using TestAzureFileSystem::TestAzureFileSystem; Result GetAzureEnv() const final { return AzureEnvClass::GetInstance(); } + + /// \brief HNSSupport value that should be assumed as the cached + /// HNSSupport on every fs()->Operation(...) call in tests. + /// + /// If TestingScenario::kHNSSupportShouldBeKnown is true, this value + /// will be HNSSupport::kEnabled or HNSSupport::kDisabled, depending + /// on the environment. Otherwise, this value will be HNSSupport::kUnknown. + /// + /// This ensures all the branches in the AzureFileSystem code operations are tested. + /// For instance, many operations executed on a missing container, wouldn't + /// get a HNSSupport::kContainerNotFound error if the cached HNSSupport was + /// already known due to a previous operation that cached the HNSSupport value. 
+ HNSSupport CachedHNSSupport(const BaseAzureEnv& env) const final { + if constexpr (TestingScenario::kHNSSupportShouldBeKnown) { + return env.WithHierarchicalNamespace() ? HNSSupport::kEnabled + : HNSSupport::kDisabled; + } else { + return HNSSupport::kUnknown; + } + } }; // How to enable the non-Azurite tests: @@ -762,54 +801,71 @@ class AzureFileSystemTestImpl : public TestAzureFileSystem { // [1]: https://azure.microsoft.com/en-gb/free/ // [2]: // https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account -using TestAzureFlatNSFileSystem = AzureFileSystemTestImpl; -using TestAzureHierarchicalNSFileSystem = AzureFileSystemTestImpl; -using TestAzuriteFileSystem = AzureFileSystemTestImpl; +using TestAzureFlatNSFileSystem = + AzureFileSystemTestImpl>; +using TestAzureHierarchicalNSFileSystem = + AzureFileSystemTestImpl>; +using TestAzuriteFileSystem = AzureFileSystemTestImpl>; -// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) - -template -using AzureFileSystemTestOnAllEnvs = AzureFileSystemTestImpl; +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS). +template +using TestAzureFileSystemOnAllEnvs = AzureFileSystemTestImpl; using AllEnvironments = - ::testing::Types; + ::testing::Types, TestingScenario, + TestingScenario>; -TYPED_TEST_SUITE(AzureFileSystemTestOnAllEnvs, AllEnvironments); +TYPED_TEST_SUITE(TestAzureFileSystemOnAllEnvs, AllEnvironments); -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespace) { +TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespace) { this->TestDetectHierarchicalNamespace(true); this->TestDetectHierarchicalNamespace(false); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { +TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { this->TestDetectHierarchicalNamespaceOnMissingContainer(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObject) { +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) +// combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and +// known according to the environment. 
+template +using TestAzureFileSystemOnAllScenarios = AzureFileSystemTestImpl; + +using AllScenarios = ::testing::Types< + TestingScenario, TestingScenario, + TestingScenario, TestingScenario, + TestingScenario, + TestingScenario>; + +TYPED_TEST_SUITE(TestAzureFileSystemOnAllScenarios, AllScenarios); + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObject) { this->TestGetFileInfoObject(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DeleteDirSuccessEmpty) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { this->TestDeleteDirSuccessEmpty(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObjectWithNestedStructure) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObjectWithNestedStructure) { this->TestGetFileInfoObjectWithNestedStructure(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirSuccessContainerAndDirectory) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirSuccessContainerAndDirectory) { this->TestCreateDirSuccessContainerAndDirectory(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerOnly) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessContainerOnly) { this->TestCreateDirRecursiveSuccessContainerOnly(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessDirectoryOnly) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessDirectoryOnly) { this->TestCreateDirRecursiveSuccessDirectoryOnly(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDirectory) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + CreateDirRecursiveSuccessContainerAndDirectory) { this->TestCreateDirRecursiveSuccessContainerAndDirectory(); } @@ -818,41 +874,41 @@ TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDi TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirFailureNonexistent) { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs_->DeleteDir(path)); + ASSERT_RAISES(IOError, fs()->DeleteDir(path)); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveBlob) { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view("hello"))); ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveDirectory) { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(parent)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(path, true)); + AssertFileInfo(fs(), path, FileType::Directory); + 
AssertFileInfo(fs(), parent, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(parent)); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessExist) { auto preexisting_data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::Directory); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); + AssertFileInfo(fs(), paths.directory, FileType::Directory); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -867,20 +923,20 @@ TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { - AssertFileInfo(fs_.get(), "", FileType::Directory); + AssertFileInfo(fs(), "", FileType::Directory); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); } TEST_F(TestAzuriteFileSystem, GetFileInfoContainer) { auto data = SetUpPreexistingData(); - AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); - AssertFileInfo(fs_.get(), "nonexistent-container", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent-container", FileType::NotFound); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + data.container_name)); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); } TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { @@ -891,7 +947,7 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Root dir select.base_dir = ""; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 2); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container", FileType::Directory); @@ -899,18 +955,18 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Empty container select.base_dir = "empty-container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Nonexistent container select.base_dir = "nonexistent-container"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.allow_not_found = true; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.allow_not_found = false; // Non-empty container select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); @@ -920,33 +976,33 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Empty "directory" select.base_dir = "container/emptydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty "directories" select.base_dir = "container/somedir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, 
fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); select.base_dir = "container/somedir/subdir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); // Nonexistent select.base_dir = "container/nonexistent"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.allow_not_found = true; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.allow_not_found = false; // Trailing slashes select.base_dir = "empty-container/"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "nonexistent-container/"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.base_dir = "container/"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); } @@ -960,19 +1016,19 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { std::vector infos; // Root dir select.base_dir = ""; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 12); ASSERT_EQ(infos, SortedInfos(infos)); AssertInfoAllContainersRecursive(infos); // Empty container select.base_dir = "empty-container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty container select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 10); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); @@ -988,19 +1044,19 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { // Empty "directory" select.base_dir = "container/emptydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty "directories" select.base_dir = "container/somedir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 2); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); select.base_dir = "container/otherdir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); @@ -1023,13 +1079,13 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { FileSelector select; // non-recursive select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); ASSERT_EQ(infos, SortedInfos(infos)); 
AssertFileInfo(infos[0], "container/mydir", FileType::Directory); select.base_dir = "container/mydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 4); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory); @@ -1038,55 +1094,55 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory); select.base_dir = "container/mydir/emptydir1"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "container/mydir/emptydir2"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "container/mydir/nonemptydir1"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File); select.base_dir = "container/mydir/nonemptydir2"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); } TEST_F(TestAzuriteFileSystem, CreateDirFailureNoContainer) { - ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); + ASSERT_RAISES(Invalid, fs()->CreateDir("", false)); } TEST_F(TestAzuriteFileSystem, CreateDirSuccessContainerOnly) { auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name, false)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + ASSERT_OK(fs()->CreateDir(container_name, false)); + AssertFileInfo(fs(), container_name, FileType::Directory); } TEST_F(TestAzuriteFileSystem, CreateDirFailureDirectoryWithMissingContainer) { const auto path = std::string("not-a-container/new-directory"); - ASSERT_RAISES(IOError, fs_->CreateDir(path, false)); + ASSERT_RAISES(IOError, fs()->CreateDir(path, false)); } TEST_F(TestAzuriteFileSystem, CreateDirRecursiveFailureNoContainer) { - ASSERT_RAISES(Invalid, fs_->CreateDir("", true)); + ASSERT_RAISES(Invalid, fs()->CreateDir("", true)); } TEST_F(TestAzuriteFileSystem, CreateDirUri) { ASSERT_RAISES( Invalid, - fs_->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); + fs()->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { const auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(container_name)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(container_name)); + AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(container_name)); + AssertFileInfo(fs(), container_name, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { @@ -1094,8 +1150,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { const auto directory_path = data.RandomDirectoryPath(rng_); // There is only virtual directory without hierarchical namespace // 
support. So the DeleteDir() for nonexistent directory does nothing. - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { @@ -1110,21 +1166,21 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { int64_t n_blobs = 257; for (int64_t i = 0; i < n_blobs; ++i) { const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view(std::to_string(i)))); ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); + AssertFileInfo(fs(), blob_path, FileType::File); } - ASSERT_OK(fs_->DeleteDir(directory_path)); + ASSERT_OK(fs()->DeleteDir(directory_path)); for (int64_t i = 0; i < n_blobs; ++i) { const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } } TEST_F(TestAzuriteFileSystem, DeleteDirUri) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(Invalid, fs_->DeleteDir("abfs://" + data.container_name + "/")); + ASSERT_RAISES(Invalid, fs()->DeleteDir("abfs://" + data.container_name + "/")); } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { @@ -1135,11 +1191,11 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.container)); - arrow::fs::AssertFileInfo(fs_.get(), paths.container, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + ASSERT_OK(fs()->DeleteDirContents(paths.container)); + AssertFileInfo(fs(), paths.container, FileType::Directory); + AssertFileInfo(fs(), paths.directory, FileType::NotFound); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -1151,11 +1207,11 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); // GH-38772: We may change this to FileType::Directory. 
- arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + AssertFileInfo(fs(), paths.directory, FileType::NotFound); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -1170,52 +1226,52 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_OK(fs_->CopyFile(data.ObjectPath(), destination_path)); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(destination_path)); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { auto data = SetUpPreexistingData(); - ASSERT_OK(fs_->CopyFile(data.ObjectPath(), data.ObjectPath())); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationTrailingSlash) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), - internal::EnsureTrailingSlash(data.ObjectPath()))); + ASSERT_RAISES(IOError, fs()->CopyFile(data.ObjectPath(), internal::EnsureTrailingSlash( + data.ObjectPath()))); } TEST_F(TestAzuriteFileSystem, CopyFileFailureSourceNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(data.NotFoundObjectPath(), destination_path)); + ASSERT_RAISES(IOError, fs()->CopyFile(data.NotFoundObjectPath(), destination_path)); } TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationParentNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = ConcatAbstractPath(PreexistingData::RandomContainerName(rng_), "copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_RAISES(IOError, fs()->CopyFile(data.ObjectPath(), destination_path)); } TEST_F(TestAzuriteFileSystem, CopyFileUri) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_RAISES(Invalid, fs_->CopyFile("abfs://" + data.ObjectPath(), destination_path)); - ASSERT_RAISES(Invalid, fs_->CopyFile(data.ObjectPath(), "abfs://" + destination_path)); + ASSERT_RAISES(Invalid, fs()->CopyFile("abfs://" + data.ObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, fs()->CopyFile(data.ObjectPath(), "abfs://" + destination_path)); } TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, 
fs()->OpenInputStream(data.ObjectPath())); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); @@ -1224,7 +1280,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(data.ObjectPath())); std::string contents; std::shared_ptr buffer; @@ -1238,10 +1294,10 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { TEST_F(TestAzuriteFileSystem, OpenInputStreamInfo) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); @@ -1255,7 +1311,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(path)); std::array buffer{}; std::int64_t size; ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); @@ -1264,26 +1320,26 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { TEST_F(TestAzuriteFileSystem, OpenInputStreamNotFound) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputStream(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputStream(data.NotFoundObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputStreamInfoInvalid) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name + "/")); - ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.container_name + "/")); + ASSERT_RAISES(IOError, fs()->OpenInputStream(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); + ASSERT_OK_AND_ASSIGN(auto info2, fs()->GetFileInfo(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputStream(info2)); } TEST_F(TestAzuriteFileSystem, OpenInputStreamUri) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + data.ObjectPath())); + ASSERT_RAISES(Invalid, fs()->OpenInputStream("abfs://" + data.ObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputStreamTrailingSlash) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputStream(data.ObjectPath() + '/')); + ASSERT_RAISES(IOError, fs()->OpenInputStream(data.ObjectPath() + '/')); } namespace { @@ -1324,7 +1380,7 @@ std::shared_ptr NormalizerKeyValueMetadata( TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(data.ObjectPath())); std::shared_ptr actual; ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); @@ -1354,7 +1410,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { TEST_F(TestAzuriteFileSystem, OpenInputStreamClosed) { auto data = 
SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); @@ -1399,13 +1455,13 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); @@ -1416,7 +1472,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; std::array buffers{ std::string(sizes[0], 'A'), @@ -1432,7 +1488,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { ASSERT_OK(output->Close()); // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::string contents; std::shared_ptr buffer; @@ -1448,26 +1504,26 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); ASSERT_OK(output->Close()); // Check that the initial content has been written - if not this test is not achieving // what it's meant to. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected0, std::string_view(inbuf.data(), size)); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(path, {})); const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); // Verify that the initial content has been overwritten. 
- ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(input, fs()->OpenInputStream(path)); ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected1, std::string_view(inbuf.data(), size)); } @@ -1475,27 +1531,27 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); ASSERT_OK(output->Close()); // Check that the initial content has been written - if not this test is not achieving // what it's meant to. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected0, std::string_view(inbuf.data())); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenAppendStream(path, {})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenAppendStream(path, {})); const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); // Verify that the initial content has not been overwritten and that the block from // the other client was not committed. - ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(input, fs()->OpenInputStream(path)); ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(std::string(inbuf.data(), size), std::string(expected0) + std::string(expected1)); @@ -1504,7 +1560,7 @@ TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-closed.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); ASSERT_OK(output->Close()); ASSERT_RAISES(Invalid, output->Write(PreexistingData::kLoremIpsum, std::strlen(PreexistingData::kLoremIpsum))); @@ -1515,7 +1571,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamUri) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-uri.txt"); - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + path)); + ASSERT_RAISES(Invalid, fs()->OpenInputStream("abfs://" + path)); } TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { @@ -1534,7 +1590,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. 
@@ -1582,7 +1638,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileRandomSeek) { UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. @@ -1607,16 +1663,16 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileIoContext) { contents.length()); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); - EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); + EXPECT_EQ(fs()->io_context().external_id(), file->io_context().external_id()); } TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(info)); std::array buffer{}; std::int64_t size; @@ -1629,21 +1685,21 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { TEST_F(TestAzuriteFileSystem, OpenInputFileNotFound) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputFile(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputFile(data.NotFoundObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputFileInfoInvalid) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name)); - ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.container_name)); + ASSERT_RAISES(IOError, fs()->OpenInputFile(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); + ASSERT_OK_AND_ASSIGN(auto info2, fs()->GetFileInfo(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputFile(info2)); } TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputFile(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Tell()); @@ -1654,6 +1710,5 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->Seek(2)); } -} // namespace } // namespace fs } // namespace arrow From b70ad0b8801d9ca0634c1937df1fc02c1609548e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 21 Dec 2023 22:00:25 +0100 Subject: [PATCH 089/570] GH-31303: [Python] Remove the legacy ParquetDataset custom python-based implementation (#39112) ### Rationale for this change Legacy ParquetDataset has been deprecated for a while now, see https://github.com/apache/arrow/issues/31529. This PR is removing the legacy implementation from the code. ### What changes are included in this PR? The PR is removing: - `ParquetDatasetPiece ` - `ParquetManifest` - `_ParquetDatasetMetadata ` - `ParquetDataset` The PR is renaming `_ParquetDatasetV2` to `ParquetDataset` which was removed. It is also updating the docstrings. The PR is updating: - `read_table` - `write_to_dataset` The PR is updating all the tests to not use `use_legacy_dataset` keyword or legacy parametrisation. ### Are these changes tested? Yes. 
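For reference, a minimal sketch of the usage that remains after this removal, mirroring the updated docstring example in this patch (the `dataset_v2` path and column names are illustrative placeholders, not part of the test suite):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative table; contents follow the docstring example updated in this PR.
table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
                  'n_legs': [2, 2, 4, 4, 5, 100],
                  'animal': ["Flamingo", "Parrot", "Dog", "Horse",
                             "Brittle stars", "Centipede"]})

# write_to_dataset no longer takes use_legacy_dataset; it always goes
# through the pyarrow.dataset implementation.
pq.write_to_dataset(table, root_path='dataset_v2', partition_cols=['year'])

# ParquetDataset is now the renamed _ParquetDatasetV2: filters may reference
# any column (using row-group statistics), not only partition keys.
dataset = pq.ParquetDataset('dataset_v2/', filters=[('n_legs', '=', 4)])

# Partition columns must be listed explicitly in `columns` to appear in the
# result when reading a subset of columns.
print(dataset.read(columns=['n_legs', 'year']))
```

This is only a sketch of the post-removal API surface; the authoritative behaviour is what the updated tests and docstrings below exercise.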
### Are there any user-facing changes? Deprecated code is removed. * Closes: #31303 --- docs/source/python/parquet.rst | 38 +- python/benchmarks/parquet.py | 29 - python/pyarrow/parquet/core.py | 1882 +++-------------- python/pyarrow/tests/parquet/__init__.py | 3 - python/pyarrow/tests/parquet/common.py | 39 +- python/pyarrow/tests/parquet/test_basic.py | 278 +-- .../parquet/test_compliant_nested_type.py | 19 +- .../pyarrow/tests/parquet/test_data_types.py | 94 +- python/pyarrow/tests/parquet/test_dataset.py | 926 ++------ python/pyarrow/tests/parquet/test_datetime.py | 14 +- python/pyarrow/tests/parquet/test_pandas.py | 192 +- .../tests/parquet/test_parquet_file.py | 25 +- .../tests/parquet/test_parquet_writer.py | 27 +- python/pyarrow/tests/test_dataset.py | 68 +- python/pyarrow/tests/test_hdfs.py | 25 +- 15 files changed, 630 insertions(+), 3029 deletions(-) diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 85a9674a689ca..d4717897660b6 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -511,36 +511,20 @@ from a remote filesystem into a pandas dataframe you may need to run ``sort_index`` to maintain row ordering (as long as the ``preserve_index`` option was enabled on write). -.. note:: - - The ParquetDataset is being reimplemented based on the new generic Dataset - API (see the :ref:`dataset` docs for an overview). This is not yet the - default, but can already be enabled by passing the ``use_legacy_dataset=False`` - keyword to :class:`ParquetDataset` or :func:`read_table`:: - - pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) - - Enabling this gives the following new features: - - - Filtering on all columns (using row group statistics) instead of only on - the partition keys. - - More fine-grained partitioning: support for a directory partitioning scheme - in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of - "/year=2019/month=11/day=15/"), and the ability to specify a schema for - the partition keys. - - General performance improvement and bug fixes. +Other features: - It also has the following changes in behaviour: +- Filtering on all columns (using row group statistics) instead of only on + the partition keys. +- Fine-grained partitioning: support for a directory partitioning scheme + in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of + "/year=2019/month=11/day=15/"), and the ability to specify a schema for + the partition keys. - - The partition keys need to be explicitly included in the ``columns`` - keyword when you want to include them in the result while reading a - subset of the columns +Note: - This new implementation is already enabled in ``read_table``, and in the - future, this will be turned on by default for ``ParquetDataset``. The new - implementation does not yet cover all existing ParquetDataset features (e.g. - specifying the ``metadata``, or the ``pieces`` property API). Feedback is - very welcome. 
+- The partition keys need to be explicitly included in the ``columns`` + keyword when you want to include them in the result while reading a + subset of the columns Using with Spark diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py index 3aeca425bc8f0..e459ea2c369b4 100644 --- a/python/benchmarks/parquet.py +++ b/python/benchmarks/parquet.py @@ -29,35 +29,6 @@ pq = None -class ParquetManifestCreation(object): - """Benchmark creating a parquet manifest.""" - - size = 10 ** 6 - tmpdir = None - - param_names = ('num_partitions', 'num_threads') - params = [(10, 100, 1000), (1, 8)] - - def setup(self, num_partitions, num_threads): - if pq is None: - raise NotImplementedError("Parquet support not enabled") - - self.tmpdir = tempfile.mkdtemp('benchmark_parquet') - rnd = np.random.RandomState(42) - num1 = rnd.randint(0, num_partitions, size=self.size) - num2 = rnd.randint(0, 1000, size=self.size) - output_df = pd.DataFrame({'num1': num1, 'num2': num2}) - output_table = pa.Table.from_pandas(output_df) - pq.write_to_dataset(output_table, self.tmpdir, ['num1']) - - def teardown(self, num_partitions, num_threads): - if self.tmpdir is not None: - shutil.rmtree(self.tmpdir) - - def time_manifest_creation(self, num_partitions, num_threads): - pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads) - - class ParquetWriteBinary(object): def setup(self): diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 852b339211b0d..98a4b2a1138c7 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -17,22 +17,17 @@ from collections import defaultdict -from concurrent import futures from contextlib import nullcontext -from functools import partial, reduce +from functools import reduce import inspect import json -from collections.abc import Collection -import numpy as np import os import re import operator -import urllib.parse import warnings import pyarrow as pa -import pyarrow.lib as lib try: import pyarrow._parquet as _parquet @@ -55,28 +50,6 @@ from pyarrow import filesystem as legacyfs from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api -_URI_STRIP_SCHEMES = ('hdfs',) - - -def _parse_uri(path): - path = _stringify_path(path) - parsed_uri = urllib.parse.urlparse(path) - if parsed_uri.scheme in _URI_STRIP_SCHEMES: - return parsed_uri.path - else: - # ARROW-4073: On Windows returning the path with the scheme - # stripped removes the drive letter, if any - return path - - -def _get_filesystem_and_path(passed_filesystem, path): - if passed_filesystem is None: - return legacyfs.resolve_filesystem_and_path(path, passed_filesystem) - else: - passed_filesystem = legacyfs._ensure_filesystem(passed_filesystem) - parsed_path = _parse_uri(path) - return passed_filesystem, parsed_path - def _check_contains_null(val): if isinstance(val, bytes): @@ -1148,516 +1121,15 @@ def _get_pandas_index_columns(keyvalues): ['index_columns']) -# ---------------------------------------------------------------------- -# Metadata container providing instructions about reading a single Parquet -# file, possibly part of a partitioned dataset - - -class ParquetDatasetPiece: - """ - DEPRECATED: A single chunk of a potentially larger Parquet dataset to read. - - The arguments will indicate to read either a single row group or all row - groups, and whether to add partition keys to the resulting pyarrow.Table. - - .. 
deprecated:: 5.0 - Directly constructing a ``ParquetDatasetPiece`` is deprecated, as well - as accessing the pieces of a ``ParquetDataset`` object. Specify - ``use_legacy_dataset=False`` when constructing the ``ParquetDataset`` - and use the ``ParquetDataset.fragments`` attribute instead. - - Parameters - ---------- - path : str or pathlib.Path - Path to file in the file system where this piece is located. - open_file_func : callable - Function to use for obtaining file handle to dataset piece. - file_options : dict - Options - row_group : int, default None - Row group to load. By default, reads all row groups. - partition_keys : list of tuples - Two-element tuples of ``(column name, ordinal index)``. - """ - - def __init__(self, path, open_file_func=partial(open, mode='rb'), - file_options=None, row_group=None, partition_keys=None): - warnings.warn( - "ParquetDatasetPiece is deprecated as of pyarrow 5.0.0 and will " - "be removed in a future version.", - FutureWarning, stacklevel=2) - self._init( - path, open_file_func, file_options, row_group, partition_keys) - - @staticmethod - def _create(path, open_file_func=partial(open, mode='rb'), - file_options=None, row_group=None, partition_keys=None): - self = ParquetDatasetPiece.__new__(ParquetDatasetPiece) - self._init( - path, open_file_func, file_options, row_group, partition_keys) - return self - - def _init(self, path, open_file_func, file_options, row_group, - partition_keys): - self.path = _stringify_path(path) - self.open_file_func = open_file_func - self.row_group = row_group - self.partition_keys = partition_keys or [] - self.file_options = file_options or {} - - def __eq__(self, other): - if not isinstance(other, ParquetDatasetPiece): - return False - return (self.path == other.path and - self.row_group == other.row_group and - self.partition_keys == other.partition_keys) - - def __repr__(self): - return ('{}({!r}, row_group={!r}, partition_keys={!r})' - .format(type(self).__name__, self.path, - self.row_group, - self.partition_keys)) - - def __str__(self): - result = '' - - if len(self.partition_keys) > 0: - partition_str = ', '.join('{}={}'.format(name, index) - for name, index in self.partition_keys) - result += 'partition[{}] '.format(partition_str) - - result += self.path - - if self.row_group is not None: - result += ' | row_group={}'.format(self.row_group) - - return result - - def get_metadata(self): - """ - Return the file's metadata. - - Returns - ------- - metadata : FileMetaData - The file's metadata - """ - with self.open() as parquet: - return parquet.metadata - - def open(self): - """ - Return instance of ParquetFile. - """ - reader = self.open_file_func(self.path) - if not isinstance(reader, ParquetFile): - reader = ParquetFile(reader, **self.file_options) - - # ensure reader knows it's responsible for closing source - # since we opened the source here internally. - reader._close_source = True - return reader - - def read(self, columns=None, use_threads=True, partitions=None, - file=None, use_pandas_metadata=False): - """ - Read this piece as a pyarrow.Table. - - Parameters - ---------- - columns : list of column names, default None - use_threads : bool, default True - Perform multi-threaded column reads. - partitions : ParquetPartitions, default None - file : file-like object - Passed to ParquetFile. - use_pandas_metadata : bool - If pandas metadata should be used or not. - - Returns - ------- - table : pyarrow.Table - The piece as a pyarrow.Table. 
- """ - if self.open_file_func is not None: - reader = self.open() - elif file is not None: - reader = ParquetFile(file, **self.file_options) - else: - # try to read the local path - reader = ParquetFile(self.path, **self.file_options) - - options = dict(columns=columns, - use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - if self.row_group is not None: - table = reader.read_row_group(self.row_group, **options) - else: - table = reader.read(**options) - - if len(self.partition_keys) > 0: - if partitions is None: - raise ValueError('Must pass partition sets') - - # Here, the index is the categorical code of the partition where - # this piece is located. Suppose we had - # - # /foo=a/0.parq - # /foo=b/0.parq - # /foo=c/0.parq - # - # Then we assign a=0, b=1, c=2. And the resulting Table pieces will - # have a DictionaryArray column named foo having the constant index - # value as indicated. The distinct categories of the partition have - # been computed in the ParquetManifest - for i, (name, index) in enumerate(self.partition_keys): - # The partition code is the same for all values in this piece - indices = np.full(len(table), index, dtype='i4') - - # This is set of all partition values, computed as part of the - # manifest, so ['a', 'b', 'c'] as in our example above. - dictionary = partitions.levels[i].dictionary - - arr = pa.DictionaryArray.from_arrays(indices, dictionary) - table = table.append_column(name, arr) - - # To ParquetFile the source looked like it was already open, so won't - # actually close it without overriding. - reader.close(force=True) - return table - - -class PartitionSet: - """ - A data structure for cataloguing the observed Parquet partitions at a - particular level. So if we have - - /foo=a/bar=0 - /foo=a/bar=1 - /foo=a/bar=2 - /foo=b/bar=0 - /foo=b/bar=1 - /foo=b/bar=2 - - Then we have two partition sets, one for foo, another for bar. As we visit - levels of the partition hierarchy, a PartitionSet tracks the distinct - values and assigns categorical codes to use when reading the pieces - - Parameters - ---------- - name : str - Name of the partition set. Under which key to collect all values. - keys : list - All possible values that have been collected for that partition set. - """ - - def __init__(self, name, keys=None): - self.name = name - self.keys = keys or [] - self.key_indices = {k: i for i, k in enumerate(self.keys)} - self._dictionary = None - - def get_index(self, key): - """ - Get the index of the partition value if it is known, otherwise assign - one - - Parameters - ---------- - key : str or int - The value for which we want to known the index. 
- """ - if key in self.key_indices: - return self.key_indices[key] - else: - index = len(self.key_indices) - self.keys.append(key) - self.key_indices[key] = index - return index - - @property - def dictionary(self): - if self._dictionary is not None: - return self._dictionary - - if len(self.keys) == 0: - raise ValueError('No known partition keys') - - # Only integer and string partition types are supported right now - try: - integer_keys = [int(x) for x in self.keys] - dictionary = lib.array(integer_keys) - except ValueError: - dictionary = lib.array(self.keys) - - self._dictionary = dictionary - return dictionary - - @property - def is_sorted(self): - return list(self.keys) == sorted(self.keys) - - -class ParquetPartitions: - - def __init__(self): - self.levels = [] - self.partition_names = set() - - def __len__(self): - return len(self.levels) - - def __getitem__(self, i): - return self.levels[i] - - def equals(self, other): - if not isinstance(other, ParquetPartitions): - raise TypeError('`other` must be an instance of ParquetPartitions') - - return (self.levels == other.levels and - self.partition_names == other.partition_names) - - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented - - def get_index(self, level, name, key): - """ - Record a partition value at a particular level, returning the distinct - code for that value at that level. - - Examples - -------- - - partitions.get_index(1, 'foo', 'a') returns 0 - partitions.get_index(1, 'foo', 'b') returns 1 - partitions.get_index(1, 'foo', 'c') returns 2 - partitions.get_index(1, 'foo', 'a') returns 0 - - Parameters - ---------- - level : int - The nesting level of the partition we are observing - name : str - The partition name - key : str or int - The partition value - """ - if level == len(self.levels): - if name in self.partition_names: - raise ValueError('{} was the name of the partition in ' - 'another level'.format(name)) - - part_set = PartitionSet(name) - self.levels.append(part_set) - self.partition_names.add(name) - - return self.levels[level].get_index(key) - - def filter_accepts_partition(self, part_key, filter, level): - p_column, p_value_index = part_key - f_column, op, f_value = filter - if p_column != f_column: - return True - - f_type = type(f_value) - - if op in {'in', 'not in'}: - if not isinstance(f_value, Collection): - raise TypeError( - "'%s' object is not a collection", f_type.__name__) - if not f_value: - raise ValueError("Cannot use empty collection as filter value") - if len({type(item) for item in f_value}) != 1: - raise ValueError("All elements of the collection '%s' must be" - " of same type", f_value) - f_type = type(next(iter(f_value))) - - elif not isinstance(f_value, str) and isinstance(f_value, Collection): - raise ValueError( - "Op '%s' not supported with a collection value", op) - - p_value = f_type(self.levels[level] - .dictionary[p_value_index].as_py()) - - if op == "=" or op == "==": - return p_value == f_value - elif op == "!=": - return p_value != f_value - elif op == '<': - return p_value < f_value - elif op == '>': - return p_value > f_value - elif op == '<=': - return p_value <= f_value - elif op == '>=': - return p_value >= f_value - elif op == 'in': - return p_value in f_value - elif op == 'not in': - return p_value not in f_value - else: - raise ValueError("'%s' is not a valid operator in predicates.", - filter[1]) - - -class ParquetManifest: - - def __init__(self, dirpath, open_file_func=None, filesystem=None, - pathsep='/', 
partition_scheme='hive', metadata_nthreads=1): - filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath) - self.filesystem = filesystem - self.open_file_func = open_file_func - self.pathsep = pathsep - self.dirpath = _stringify_path(dirpath) - self.partition_scheme = partition_scheme - self.partitions = ParquetPartitions() - self.pieces = [] - self._metadata_nthreads = metadata_nthreads - self._thread_pool = futures.ThreadPoolExecutor( - max_workers=metadata_nthreads) - - self.common_metadata_path = None - self.metadata_path = None - - self._visit_level(0, self.dirpath, []) - - # Due to concurrency, pieces will potentially by out of order if the - # dataset is partitioned so we sort them to yield stable results - self.pieces.sort(key=lambda piece: piece.path) - - if self.common_metadata_path is None: - # _common_metadata is a subset of _metadata - self.common_metadata_path = self.metadata_path - - self._thread_pool.shutdown() - - def _visit_level(self, level, base_path, part_keys): - fs = self.filesystem - - _, directories, files = next(fs.walk(base_path)) - - filtered_files = [] - for path in files: - full_path = self.pathsep.join((base_path, path)) - if path.endswith('_common_metadata'): - self.common_metadata_path = full_path - elif path.endswith('_metadata'): - self.metadata_path = full_path - elif self._should_silently_exclude(path): - continue - else: - filtered_files.append(full_path) - - # ARROW-1079: Filter out "private" directories starting with underscore - filtered_directories = [self.pathsep.join((base_path, x)) - for x in directories - if not _is_private_directory(x)] - - filtered_files.sort() - filtered_directories.sort() - - if len(filtered_files) > 0 and len(filtered_directories) > 0: - raise ValueError('Found files in an intermediate ' - 'directory: {}'.format(base_path)) - elif len(filtered_directories) > 0: - self._visit_directories(level, filtered_directories, part_keys) - else: - self._push_pieces(filtered_files, part_keys) - - def _should_silently_exclude(self, file_name): - return (file_name.endswith('.crc') or # Checksums - file_name.endswith('_$folder$') or # HDFS directories in S3 - file_name.startswith('.') or # Hidden files starting with . - file_name.startswith('_') or # Hidden files starting with _ - file_name in EXCLUDED_PARQUET_PATHS) - - def _visit_directories(self, level, directories, part_keys): - futures_list = [] - for path in directories: - head, tail = _path_split(path, self.pathsep) - name, key = _parse_hive_partition(tail) - - index = self.partitions.get_index(level, name, key) - dir_part_keys = part_keys + [(name, index)] - # If you have less threads than levels, the wait call will block - # indefinitely due to multiple waits within a thread. 
- if level < self._metadata_nthreads: - future = self._thread_pool.submit(self._visit_level, - level + 1, - path, - dir_part_keys) - futures_list.append(future) - else: - self._visit_level(level + 1, path, dir_part_keys) - if futures_list: - futures.wait(futures_list) - - def _parse_partition(self, dirname): - if self.partition_scheme == 'hive': - return _parse_hive_partition(dirname) - else: - raise NotImplementedError('partition schema: {}' - .format(self.partition_scheme)) - - def _push_pieces(self, files, part_keys): - self.pieces.extend([ - ParquetDatasetPiece._create(path, partition_keys=part_keys, - open_file_func=self.open_file_func) - for path in files - ]) - - -def _parse_hive_partition(value): - if '=' not in value: - raise ValueError('Directory name did not appear to be a ' - 'partition: {}'.format(value)) - return value.split('=', 1) - - -def _is_private_directory(x): - _, tail = os.path.split(x) - return (tail.startswith('_') or tail.startswith('.')) and '=' not in tail - - -def _path_split(path, sep): - i = path.rfind(sep) + 1 - head, tail = path[:i], path[i:] - head = head.rstrip(sep) - return head, tail - - EXCLUDED_PARQUET_PATHS = {'_SUCCESS'} -class _ParquetDatasetMetadata: - __slots__ = ('fs', 'memory_map', 'read_dictionary', 'common_metadata', - 'buffer_size') - - -def _open_dataset_file(dataset, path, meta=None): - if (dataset.fs is not None and - not isinstance(dataset.fs, legacyfs.LocalFileSystem)): - path = dataset.fs.open(path, mode='rb') - return ParquetFile( - path, - metadata=meta, - memory_map=dataset.memory_map, - read_dictionary=dataset.read_dictionary, - common_metadata=dataset.common_metadata, - buffer_size=dataset.buffer_size +def _is_local_file_system(fs): + return isinstance(fs, LocalFileSystem) or isinstance( + fs, legacyfs.LocalFileSystem ) -_DEPR_MSG = ( - "'{}' attribute is deprecated as of pyarrow 5.0.0 and will be removed " - "in a future version.{}" -) - - _read_docstring_common = """\ read_dictionary : list, default None List of names or column paths (for nested types) to read directly @@ -1680,6 +1152,7 @@ def _open_dataset_file(dataset, path, meta=None): you need to specify the field names or a full schema. See the ``pyarrow.dataset.partitioning()`` function for more details.""" + _parquet_dataset_example = """\ Generate an example PyArrow Table and write it to a partitioned dataset: @@ -1688,15 +1161,13 @@ def _open_dataset_file(dataset, path, meta=None): ... 'n_legs': [2, 2, 4, 4, 5, 100], ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq ->>> pq.write_to_dataset(table, root_path='dataset_name', -... partition_cols=['year'], -... use_legacy_dataset=False) +>>> pq.write_to_dataset(table, root_path='dataset_v2', +... partition_cols=['year']) create a ParquetDataset object from the dataset source: ->>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) +>>> dataset = pq.ParquetDataset('dataset_v2/') and read the data: @@ -1711,7 +1182,7 @@ def _open_dataset_file(dataset, path, meta=None): create a ParquetDataset object with filter: ->>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False, +>>> dataset = pq.ParquetDataset('dataset_v2/', ... 
filters=[('n_legs','=',4)]) >>> dataset.read().to_pandas() n_legs animal year @@ -1721,7 +1192,6 @@ def _open_dataset_file(dataset, path, meta=None): class ParquetDataset: - __doc__ = """ Encapsulates details of reading a complete Parquet dataset possibly consisting of multiple files and partitions in subdirectories. @@ -1735,39 +1205,26 @@ class ParquetDataset: Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. schema : pyarrow.parquet.Schema - Use schema obtained elsewhere to validate file schemas. Alternative to - metadata parameter. -metadata : pyarrow.parquet.FileMetaData - Use metadata obtained elsewhere to validate file schemas. -split_row_groups : bool, default False - Divide files into pieces for each row group in the file. -validate_schema : bool, default True - Check that individual file schemas are all the same / compatible. + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None Rows which do not match the filter predicate will be removed from scanned data. Partition keys embedded in a nested directory structure will be exploited to avoid loading files at all if they contain no matching rows. - If `use_legacy_dataset` is True, filters can only reference partition - keys and only a hive-style directory structure is supported. When - setting `use_legacy_dataset` to False, also within-file level filtering - and different partitioning schemes are supported. + Within-file level filtering and different partitioning schemes are supported. {1} -metadata_nthreads : int, default 1 - How many threads to allow the thread pool which is used to read the - dataset metadata. Increasing this is helpful to read partitioned - datasets. {0} -use_legacy_dataset : bool, default False - Set to False to enable the new code path (using the - new Arrow Dataset API). Among other things, this allows to pass - `filters` for all columns and not only the partition keys, enables - different partitioning schemes, etc. +ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. pre_buffer : bool, default True Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a - background I/O thread pool. This option is only supported for - use_legacy_dataset=False. If using a filesystem layer that itself + background I/O thread pool. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. Set to False if you want to prioritize minimal memory usage over maximum speed. @@ -1775,6 +1232,10 @@ class ParquetDataset: Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps in nanoseconds. +decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. thrift_string_size_limit : int, default None If not None, override the maximum total string size allocated when decoding Thrift structures. 
The default limit should be @@ -1785,739 +1246,95 @@ class ParquetDataset: sufficient for most Parquet files. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. +use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. Examples -------- {2} """.format(_read_docstring_common, _DNF_filter_doc, _parquet_dataset_example) - def __new__(cls, path_or_paths=None, filesystem=None, schema=None, - metadata=None, split_row_groups=False, validate_schema=True, - filters=None, metadata_nthreads=None, read_dictionary=None, - memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=None, pre_buffer=True, - coerce_int96_timestamp_unit=None, - thrift_string_size_limit=None, - thrift_container_size_limit=None, - page_checksum_verification=False): - - extra_msg = "" - if use_legacy_dataset is None: - # if an old filesystem is passed -> still use to old implementation - if isinstance(filesystem, legacyfs.FileSystem): - use_legacy_dataset = True - extra_msg = ( - " The legacy behaviour was still chosen because a " - "deprecated 'pyarrow.filesystem' filesystem was specified " - "(use the filesystems from pyarrow.fs instead)." - ) - # otherwise the default is already False - else: - use_legacy_dataset = False - - if not use_legacy_dataset: - return _ParquetDatasetV2( - path_or_paths, filesystem=filesystem, - filters=filters, - partitioning=partitioning, - read_dictionary=read_dictionary, - memory_map=memory_map, - buffer_size=buffer_size, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - # unsupported keywords - schema=schema, metadata=metadata, - split_row_groups=split_row_groups, - validate_schema=validate_schema, - metadata_nthreads=metadata_nthreads, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, - ) - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 11.0.0, and the legacy implementation " - "will be removed in a future version." 
+ extra_msg, - FutureWarning, stacklevel=2) - self = object.__new__(cls) - return self - - def __init__(self, path_or_paths, filesystem=None, schema=None, - metadata=None, split_row_groups=False, validate_schema=True, - filters=None, metadata_nthreads=None, read_dictionary=None, - memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=None, pre_buffer=True, + def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, + read_dictionary=None, memory_map=False, buffer_size=None, + partitioning="hive", ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, - thrift_string_size_limit=None, + decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): - if partitioning != "hive": - raise ValueError( - 'Only "hive" for hive-like partitioning is supported when ' - 'using use_legacy_dataset=True') - if metadata_nthreads is not None: - warnings.warn( - "Specifying the 'metadata_nthreads' argument is deprecated as " - "of pyarrow 8.0.0, and the argument will be removed in a " - "future version", - FutureWarning, stacklevel=2, - ) - else: - metadata_nthreads = 1 - - self._ds_metadata = _ParquetDatasetMetadata() - a_path = path_or_paths - if isinstance(a_path, list): - a_path = a_path[0] - - self._ds_metadata.fs, _ = _get_filesystem_and_path(filesystem, a_path) - if isinstance(path_or_paths, list): - self.paths = [_parse_uri(path) for path in path_or_paths] - else: - self.paths = _parse_uri(path_or_paths) - - self._ds_metadata.read_dictionary = read_dictionary - self._ds_metadata.memory_map = memory_map - self._ds_metadata.buffer_size = buffer_size - - (self._pieces, - self._partitions, - self._common_metadata_path, - self._metadata_path) = _make_manifest( - path_or_paths, self._fs, metadata_nthreads=metadata_nthreads, - open_file_func=partial(_open_dataset_file, self._ds_metadata) - ) - - if self._common_metadata_path is not None: - with self._fs.open(self._common_metadata_path) as f: - self._ds_metadata.common_metadata = read_metadata( - f, - memory_map=memory_map - ) - else: - self._ds_metadata.common_metadata = None + page_checksum_verification=False, + use_legacy_dataset=None): - if metadata is not None: + if use_legacy_dataset is not None: warnings.warn( - "Specifying the 'metadata' argument with 'use_legacy_dataset=" - "True' is deprecated as of pyarrow 8.0.0.", + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", FutureWarning, stacklevel=2) - if metadata is None and self._metadata_path is not None: - with self._fs.open(self._metadata_path) as f: - self._metadata = read_metadata(f, memory_map=memory_map) - else: - self._metadata = metadata - - if schema is not None: - warnings.warn( - "Specifying the 'schema' argument with 'use_legacy_dataset=" - "True' is deprecated as of pyarrow 8.0.0. 
You can still " - "specify it in combination with 'use_legacy_dataset=False', " - "but in that case you need to specify a pyarrow.Schema " - "instead of a ParquetSchema.", - FutureWarning, stacklevel=2) - self._schema = schema + import pyarrow.dataset as ds - self.split_row_groups = split_row_groups + # map format arguments + read_options = { + "pre_buffer": pre_buffer, + "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, + "thrift_string_size_limit": thrift_string_size_limit, + "thrift_container_size_limit": thrift_container_size_limit, + "page_checksum_verification": page_checksum_verification, + } + if buffer_size: + read_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + read_options.update(dictionary_columns=read_dictionary) - if split_row_groups: - raise NotImplementedError("split_row_groups not yet implemented") + if decryption_properties is not None: + read_options.update(decryption_properties=decryption_properties) + self._filter_expression = None if filters is not None: - if hasattr(filters, "cast"): - raise TypeError( - "Expressions as filter not supported for legacy dataset") - filters = _check_filters(filters) - self._filter(filters) - - if validate_schema: - self.validate_schemas() - - def __getnewargs_ex__(self): - # when creating a new instance while unpickling, force to use the - # legacy code path to create a ParquetDataset instance - # instead of a _ParquetDatasetV2 instance - return ((), dict(use_legacy_dataset=True)) - - def equals(self, other): - if not isinstance(other, ParquetDataset): - raise TypeError('`other` must be an instance of ParquetDataset') + self._filter_expression = filters_to_expression(filters) - if self._fs.__class__ != other._fs.__class__: - return False - for prop in ('paths', '_pieces', '_partitions', - '_common_metadata_path', '_metadata_path', - '_common_metadata', '_metadata', '_schema', - 'split_row_groups'): - if getattr(self, prop) != getattr(other, prop): - return False - for prop in ('memory_map', 'buffer_size'): - if ( - getattr(self._ds_metadata, prop) != - getattr(other._ds_metadata, prop) - ): - return False - - return True + # map old filesystems to new one + if filesystem is not None: + filesystem = _ensure_filesystem( + filesystem, use_mmap=memory_map) + elif filesystem is None and memory_map: + # if memory_map is specified, assume local file system (string + # path can in principle be URI for any filesystem) + filesystem = LocalFileSystem(use_mmap=memory_map) - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented + # This needs to be checked after _ensure_filesystem, because that + # handles the case of an fsspec LocalFileSystem + if ( + hasattr(path_or_paths, "__fspath__") and + filesystem is not None and + not _is_local_file_system(filesystem) + ): + raise TypeError( + "Path-like objects with __fspath__ must only be used with " + f"local file systems, not {type(filesystem)}" + ) - def validate_schemas(self): - if self._metadata is None and self._schema is None: - if self._common_metadata is not None: - self._schema = self._common_metadata.schema + # check for single fragment dataset + single_file = None + self._base_dir = None + if not isinstance(path_or_paths, list): + if _is_path_like(path_or_paths): + path_or_paths = _stringify_path(path_or_paths) + if filesystem is None: + # path might be a URI describing the FileSystem as well + try: + filesystem, path_or_paths = FileSystem.from_uri( + path_or_paths) + except 
ValueError: + filesystem = LocalFileSystem(use_mmap=memory_map) + finfo = filesystem.get_file_info(path_or_paths) + if finfo.is_file: + single_file = path_or_paths + if finfo.type == FileType.Directory: + self._base_dir = path_or_paths else: - self._schema = self._pieces[0].get_metadata().schema - elif self._schema is None: - self._schema = self._metadata.schema - - # Verify schemas are all compatible - dataset_schema = self._schema.to_arrow_schema() - # Exclude the partition columns from the schema, they are provided - # by the path, not the DatasetPiece - if self._partitions is not None: - for partition_name in self._partitions.partition_names: - if dataset_schema.get_field_index(partition_name) != -1: - field_idx = dataset_schema.get_field_index(partition_name) - dataset_schema = dataset_schema.remove(field_idx) - - for piece in self._pieces: - file_metadata = piece.get_metadata() - file_schema = file_metadata.schema.to_arrow_schema() - if not dataset_schema.equals(file_schema, check_metadata=False): - raise ValueError('Schema in {!s} was different. \n' - '{!s}\n\nvs\n\n{!s}' - .format(piece, file_schema, - dataset_schema)) + single_file = path_or_paths - def read(self, columns=None, use_threads=True, use_pandas_metadata=False): - """ - Read multiple Parquet files as a single pyarrow.Table. - - Parameters - ---------- - columns : List[str] - Names of columns to read from the file. - use_threads : bool, default True - Perform multi-threaded column reads - use_pandas_metadata : bool, default False - Passed through to each dataset piece. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_read', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_read/', - ... use_legacy_dataset=False) - - Read multiple Parquet files as a single pyarrow.Table: - - >>> dataset.read(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[5],[2],[4,100],[2,4]] - """ - tables = [] - for piece in self._pieces: - table = piece.read(columns=columns, - use_threads=use_threads, - partitions=self._partitions, - use_pandas_metadata=use_pandas_metadata) - tables.append(table) - - all_data = lib.concat_tables(tables) - - if use_pandas_metadata: - # We need to ensure that this metadata is set in the Table's schema - # so that Table.to_pandas will construct pandas.DataFrame with the - # right index - common_metadata = self._get_common_pandas_metadata() - current_metadata = all_data.schema.metadata or {} - - if common_metadata and b'pandas' not in current_metadata: - all_data = all_data.replace_schema_metadata({ - b'pandas': common_metadata}) - - return all_data - - def read_pandas(self, **kwargs): - """ - Read dataset including pandas metadata, if any. Other arguments passed - through to ParquetDataset.read, see docstring for further details. - - Parameters - ---------- - **kwargs : optional - All additional options to pass to the reader. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). 
- - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned - dataset: - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, 'table.parquet') - >>> dataset = pq.ParquetDataset('table.parquet', - ... use_legacy_dataset=False) - - Read dataset including pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,2,4,4,5,100]] - - Select pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} - """ - return self.read(use_pandas_metadata=True, **kwargs) - - def _get_common_pandas_metadata(self): - if self._common_metadata is None: - return None - - keyvalues = self._common_metadata.metadata - return keyvalues.get(b'pandas', None) - - def _filter(self, filters): - accepts_filter = self._partitions.filter_accepts_partition - - def one_filter_accepts(piece, filter): - return all(accepts_filter(part_key, filter, level) - for level, part_key in enumerate(piece.partition_keys)) - - def all_filters_accept(piece): - return any(all(one_filter_accepts(piece, f) for f in conjunction) - for conjunction in filters) - - self._pieces = [p for p in self._pieces if all_filters_accept(p)] - - @property - def pieces(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.pieces", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.fragments' attribute " - "instead."), - FutureWarning, stacklevel=2) - return self._pieces - - @property - def partitions(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.partitions", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.partitioning' attribute " - "instead."), - FutureWarning, stacklevel=2) - return self._partitions - - @property - def schema(self): - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.schema", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.schema' attribute " - "instead (which will return an Arrow schema instead of a " - "Parquet schema)."), - FutureWarning, stacklevel=2) - return self._schema - - @property - def memory_map(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.memory_map", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.memory_map - - @property - def read_dictionary(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.read_dictionary", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.read_dictionary - - @property - def buffer_size(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.buffer_size", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.buffer_size - - _fs = property( - operator.attrgetter('_ds_metadata.fs') - ) - - @property - def fs(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.fs", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.filesystem' attribute " - "instead."), - FutureWarning, 
stacklevel=2) - return self._ds_metadata.fs - - @property - def metadata(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.metadata", ""), - FutureWarning, stacklevel=2) - return self._metadata - - @property - def metadata_path(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.metadata_path", ""), - FutureWarning, stacklevel=2) - return self._metadata_path - - @property - def common_metadata_path(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.common_metadata_path", ""), - FutureWarning, stacklevel=2) - return self._common_metadata_path - - _common_metadata = property( - operator.attrgetter('_ds_metadata.common_metadata') - ) - - @property - def common_metadata(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.common_metadata", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.common_metadata - - @property - def fragments(self): - """ - A list of the Dataset source fragments or pieces with absolute - file paths. To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_fragments', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_fragments/', - ... use_legacy_dataset=False) - - List the fragments: - - >>> dataset.fragments - [>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_files', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_files/', - ... use_legacy_dataset=False) - - List the files: - - >>> dataset.files - ['dataset_name_files/year=2019/...-0.parquet', ... - """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - @property - def filesystem(self): - """ - The filesystem type of the Dataset source. - To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. - """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - @property - def partitioning(self): - """ - The partitioning of the Dataset source, if discovered. - To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. 
- """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - -def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, - open_file_func=None): - partitions = None - common_metadata_path = None - metadata_path = None - - if isinstance(path_or_paths, list) and len(path_or_paths) == 1: - # Dask passes a directory as a list of length 1 - path_or_paths = path_or_paths[0] - - if _is_path_like(path_or_paths) and fs.isdir(path_or_paths): - manifest = ParquetManifest(path_or_paths, filesystem=fs, - open_file_func=open_file_func, - pathsep=getattr(fs, "pathsep", "/"), - metadata_nthreads=metadata_nthreads) - common_metadata_path = manifest.common_metadata_path - metadata_path = manifest.metadata_path - pieces = manifest.pieces - partitions = manifest.partitions - else: - if not isinstance(path_or_paths, list): - path_or_paths = [path_or_paths] - - # List of paths - if len(path_or_paths) == 0: - raise ValueError('Must pass at least one file path') - - pieces = [] - for path in path_or_paths: - if not fs.isfile(path): - raise OSError('Passed non-file path: {}' - .format(path)) - piece = ParquetDatasetPiece._create( - path, open_file_func=open_file_func) - pieces.append(piece) - - return pieces, partitions, common_metadata_path, metadata_path - - -def _is_local_file_system(fs): - return isinstance(fs, LocalFileSystem) or isinstance( - fs, legacyfs.LocalFileSystem - ) - - -class _ParquetDatasetV2: - """ - ParquetDataset shim using the Dataset API under the hood. - - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_v2', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - - create a ParquetDataset object from the dataset source: - - >>> dataset = pq.ParquetDataset('dataset_v2/', use_legacy_dataset=False) - - and read the data: - - >>> dataset.read().to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - create a ParquetDataset object with filter: - - >>> dataset = pq.ParquetDataset('dataset_v2/', - ... filters=[('n_legs','=',4)], - ... 
use_legacy_dataset=False) - >>> dataset.read().to_pandas() - n_legs animal year - 0 4 Dog 2021 - 1 4 Horse 2022 - """ - - def __init__(self, path_or_paths, filesystem=None, *, filters=None, - partitioning="hive", read_dictionary=None, buffer_size=None, - memory_map=False, ignore_prefixes=None, pre_buffer=True, - coerce_int96_timestamp_unit=None, schema=None, - decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None, - page_checksum_verification=False, - **kwargs): - import pyarrow.dataset as ds - - # Raise error for not supported keywords - for keyword, default in [ - ("metadata", None), ("split_row_groups", False), - ("validate_schema", True), ("metadata_nthreads", None)]: - if keyword in kwargs and kwargs[keyword] is not default: - raise ValueError( - "Keyword '{0}' is not yet supported with the new " - "Dataset API".format(keyword)) - - # map format arguments - read_options = { - "pre_buffer": pre_buffer, - "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, - "thrift_string_size_limit": thrift_string_size_limit, - "thrift_container_size_limit": thrift_container_size_limit, - "page_checksum_verification": page_checksum_verification, - } - if buffer_size: - read_options.update(use_buffered_stream=True, - buffer_size=buffer_size) - if read_dictionary is not None: - read_options.update(dictionary_columns=read_dictionary) - - if decryption_properties is not None: - read_options.update(decryption_properties=decryption_properties) - - self._filter_expression = None - if filters is not None: - self._filter_expression = filters_to_expression(filters) - - # map old filesystems to new one - if filesystem is not None: - filesystem = _ensure_filesystem( - filesystem, use_mmap=memory_map) - elif filesystem is None and memory_map: - # if memory_map is specified, assume local file system (string - # path can in principle be URI for any filesystem) - filesystem = LocalFileSystem(use_mmap=memory_map) - - # This needs to be checked after _ensure_filesystem, because that - # handles the case of an fsspec LocalFileSystem - if ( - hasattr(path_or_paths, "__fspath__") and - filesystem is not None and - not _is_local_file_system(filesystem) - ): - raise TypeError( - "Path-like objects with __fspath__ must only be used with " - f"local file systems, not {type(filesystem)}" - ) - - # check for single fragment dataset - single_file = None - self._base_dir = None - if not isinstance(path_or_paths, list): - if _is_path_like(path_or_paths): - path_or_paths = _stringify_path(path_or_paths) - if filesystem is None: - # path might be a URI describing the FileSystem as well - try: - filesystem, path_or_paths = FileSystem.from_uri( - path_or_paths) - except ValueError: - filesystem = LocalFileSystem(use_mmap=memory_map) - finfo = filesystem.get_file_info(path_or_paths) - if finfo.is_file: - single_file = path_or_paths - if finfo.type == FileType.Directory: - self._base_dir = path_or_paths - else: - single_file = path_or_paths - - parquet_format = ds.ParquetFileFormat(**read_options) + parquet_format = ds.ParquetFileFormat(**read_options) if single_file is not None: fragment = parquet_format.make_fragment(single_file, filesystem) @@ -2540,12 +1357,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, ignore_prefixes=ignore_prefixes) def equals(self, other): - if isinstance(other, ParquetDataset): - raise TypeError( - "`other` must be an instance of ParquetDataset constructed " - "with `use_legacy_dataset=False`" - ) - if not isinstance(other, 
_ParquetDatasetV2): + if not isinstance(other, ParquetDataset): raise TypeError('`other` must be an instance of ParquetDataset') return (self.schema == other.schema and @@ -2576,10 +1388,8 @@ def schema(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_schema', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_schema/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_schema/') Read the schema: @@ -2598,8 +1408,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): ---------- columns : List[str] Names of columns to read from the dataset. The partition fields - are not automatically included (in contrast to when setting - ``use_legacy_dataset=True``). + are not automatically included. use_threads : bool, default True Perform multi-threaded column reads. use_pandas_metadata : bool, default False @@ -2622,10 +1431,8 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_read', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_read/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_read/') Read the dataset: @@ -2694,7 +1501,12 @@ def _get_common_pandas_metadata(self): def read_pandas(self, **kwargs): """ Read dataset including pandas metadata, if any. Other arguments passed - through to ParquetDataset.read, see docstring for further details. + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` Examples -------- @@ -2709,8 +1521,7 @@ def read_pandas(self, **kwargs): >>> table = pa.Table.from_pandas(df) >>> import pyarrow.parquet as pq >>> pq.write_table(table, 'table_V2.parquet') - >>> dataset = pq.ParquetDataset('table_V2.parquet', - ... use_legacy_dataset=False) + >>> dataset = pq.ParquetDataset('table_V2.parquet') Read the dataset with pandas metadata: @@ -2725,14 +1536,6 @@ def read_pandas(self, **kwargs): """ return self.read(use_pandas_metadata=True, **kwargs) - @property - def pieces(self): - warnings.warn( - _DEPR_MSG.format("ParquetDataset.pieces", - " Use the '.fragments' attribute instead"), - FutureWarning, stacklevel=2) - return list(self._dataset.get_fragments()) - @property def fragments(self): """ @@ -2750,10 +1553,8 @@ def fragments(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_fragments', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_fragments/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_fragments/') List the fragments: @@ -2778,10 +1579,8 @@ def files(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_files', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_files/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_files/') List the files: @@ -2822,8 +1621,6 @@ def partitioning(self): no columns. 
use_threads : bool, default True Perform multi-threaded column reads. -metadata : FileMetaData - If separately computed schema : Schema, optional Optionally provide the Schema for the parquet dataset, in which case it will not be inferred from the source. @@ -2836,30 +1633,21 @@ def partitioning(self): Rows which do not match the filter predicate will be removed from scanned data. Partition keys embedded in a nested directory structure will be exploited to avoid loading files at all if they contain no matching rows. - If `use_legacy_dataset` is True, filters can only reference partition - keys and only a hive-style directory structure is supported. When - setting `use_legacy_dataset` to False, also within-file level filtering - and different partitioning schemes are supported. + Within-file level filtering and different partitioning schemes are supported. {3} -use_legacy_dataset : bool, default False - By default, `read_table` uses the new Arrow Datasets API since - pyarrow 1.0.0. Among other things, this allows to pass `filters` - for all columns and not only the partition keys, enables - different partitioning schemes, etc. - Set to True to use the legacy behaviour (this option is deprecated, - and the legacy implementation will be removed in a future version). +use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. ignore_prefixes : list, optional Files matching any of these prefixes will be ignored by the - discovery process if use_legacy_dataset=False. + discovery process. This is matched to the basename of a path. By default this is ['.', '_']. Note that discovery happens only if a directory is passed as source. pre_buffer : bool, default True Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. This option is only supported for - use_legacy_dataset=False. If using a filesystem layer that itself + background I/O thread pool. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. 
coerce_int96_timestamp_unit : str, default None @@ -2968,129 +1756,78 @@ def partitioning(self): """ -def read_table(source, *, columns=None, use_threads=True, metadata=None, +def read_table(source, *, columns=None, use_threads=True, schema=None, use_pandas_metadata=False, read_dictionary=None, memory_map=False, buffer_size=0, partitioning="hive", - filesystem=None, filters=None, use_legacy_dataset=False, + filesystem=None, filters=None, use_legacy_dataset=None, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, page_checksum_verification=False): - if not use_legacy_dataset: - if metadata is not None: + + if use_legacy_dataset is not None: + warnings.warn( + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", + FutureWarning, stacklevel=2) + + try: + dataset = ParquetDataset( + source, + schema=schema, + filesystem=filesystem, + partitioning=partitioning, + memory_map=memory_map, + read_dictionary=read_dictionary, + buffer_size=buffer_size, + filters=filters, + ignore_prefixes=ignore_prefixes, + pre_buffer=pre_buffer, + coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, + thrift_string_size_limit=thrift_string_size_limit, + thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, + ) + except ImportError: + # fall back on ParquetFile for simple cases when pyarrow.dataset + # module is not available + if filters is not None: raise ValueError( - "The 'metadata' keyword is no longer supported with the new " - "datasets-based implementation. Specify " - "'use_legacy_dataset=True' to temporarily recover the old " - "behaviour." 
- ) - try: - dataset = _ParquetDatasetV2( - source, - schema=schema, - filesystem=filesystem, - partitioning=partitioning, - memory_map=memory_map, - read_dictionary=read_dictionary, - buffer_size=buffer_size, - filters=filters, - ignore_prefixes=ignore_prefixes, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, + "the 'filters' keyword is not supported when the " + "pyarrow.dataset module is not available" ) - except ImportError: - # fall back on ParquetFile for simple cases when pyarrow.dataset - # module is not available - if filters is not None: - raise ValueError( - "the 'filters' keyword is not supported when the " - "pyarrow.dataset module is not available" - ) - if partitioning != "hive": - raise ValueError( - "the 'partitioning' keyword is not supported when the " - "pyarrow.dataset module is not available" - ) - if schema is not None: - raise ValueError( - "the 'schema' argument is not supported when the " - "pyarrow.dataset module is not available" - ) - filesystem, path = _resolve_filesystem_and_path(source, filesystem) - if filesystem is not None: - source = filesystem.open_input_file(path) - # TODO test that source is not a directory or a list - dataset = ParquetFile( - source, metadata=metadata, read_dictionary=read_dictionary, - memory_map=memory_map, buffer_size=buffer_size, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - decryption_properties=decryption_properties, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, + if partitioning != "hive": + raise ValueError( + "the 'partitioning' keyword is not supported when the " + "pyarrow.dataset module is not available" ) - - return dataset.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 8.0.0, and the legacy implementation will " - "be removed in a future version.", - FutureWarning, stacklevel=2) - - if ignore_prefixes is not None: - raise ValueError( - "The 'ignore_prefixes' keyword is only supported when " - "use_legacy_dataset=False") - - if page_checksum_verification: - raise ValueError( - "The 'page_checksum_verification' keyword is only supported when " - "use_legacy_dataset=False") - - if schema is not None: - raise ValueError( - "The 'schema' argument is only supported when " - "use_legacy_dataset=False") - - if _is_path_like(source): - with warnings.catch_warnings(): - # Suppress second warning from ParquetDataset constructor - warnings.filterwarnings( - "ignore", "Passing 'use_legacy_dataset", FutureWarning) - pf = ParquetDataset( - source, metadata=metadata, memory_map=memory_map, - read_dictionary=read_dictionary, - buffer_size=buffer_size, - filesystem=filesystem, filters=filters, - partitioning=partitioning, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - use_legacy_dataset=True, + if schema is not None: + raise ValueError( + "the 'schema' argument is not supported when the " + "pyarrow.dataset module is not available" ) - else: - pf = ParquetFile( - source, metadata=metadata, - read_dictionary=read_dictionary, - memory_map=memory_map, - buffer_size=buffer_size, + filesystem, path = 
_resolve_filesystem_and_path(source, filesystem) + if filesystem is not None: + source = filesystem.open_input_file(path) + # TODO test that source is not a directory or a list + dataset = ParquetFile( + source, read_dictionary=read_dictionary, + memory_map=memory_map, buffer_size=buffer_size, + pre_buffer=pre_buffer, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - decryption_properties=decryption_properties + decryption_properties=decryption_properties, + thrift_string_size_limit=thrift_string_size_limit, + thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) - return pf.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) + return dataset.read(columns=columns, use_threads=use_threads, + use_pandas_metadata=use_pandas_metadata) -read_table.__doc__ = _read_table_docstring.format( - """Read a Table from Parquet format -Note: starting with pyarrow 1.0, the default for `use_legacy_dataset` is -switched to False.""", +read_table.__doc__ = _read_table_docstring.format( + """Read a Table from Parquet format""", "\n".join(("""use_pandas_metadata : bool, default False If True and file has custom pandas schema metadata, ensure that index columns are also loaded.""", _read_docstring_common)), @@ -3233,23 +1970,13 @@ def write_table(table, where, row_group_size=None, version='2.6', """.format(_parquet_writer_arg_docs, _write_table_example) -def _mkdir_if_not_exists(fs, path): - if fs._isfilestore() and not fs.exists(path): - try: - fs.mkdir(path) - except OSError: - assert fs.exists(path) - - def write_to_dataset(table, root_path, partition_cols=None, - partition_filename_cb=None, filesystem=None, - use_legacy_dataset=None, schema=None, - partitioning=None, basename_template=None, - use_threads=None, file_visitor=None, - existing_data_behavior=None, + filesystem=None, use_legacy_dataset=None, + schema=None, partitioning=None, + basename_template=None, use_threads=None, + file_visitor=None, existing_data_behavior=None, **kwargs): - """Wrapper around dataset.write_dataset (when use_legacy_dataset=False) or - parquet.write_table (when use_legacy_dataset=True) for writing a Table to + """Wrapper around dataset.write_dataset for writing a Table to Parquet format by partitions. For each combination of partition columns and values, a subdirectories are created in the following @@ -3271,45 +1998,31 @@ def write_to_dataset(table, root_path, partition_cols=None, ---------- table : pyarrow.Table root_path : str, pathlib.Path - The root directory of the dataset + The root directory of the dataset. partition_cols : list, Column names by which to partition the dataset. - Columns are partitioned in the order they are given - partition_filename_cb : callable, - A callback function that takes the partition key(s) as an argument - and allow you to override the partition filename. If nothing is - passed, the filename will consist of a uuid. - This option is only supported for use_legacy_dataset=True. - When use_legacy_dataset=None and this option is specified, - use_legacy_dataset will be set to True. + Columns are partitioned in the order they are given. filesystem : FileSystem, default None If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. - use_legacy_dataset : bool - Default is False. 
Set to True to use the legacy behaviour - (this option is deprecated, and the legacy implementation will be - removed in a future version). The legacy implementation still - supports the `partition_filename_cb` keyword but is less efficient - when using partition columns. + use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. schema : Schema, optional - This option is only supported for use_legacy_dataset=False. + This Schema of the dataset. partitioning : Partitioning or list[str], optional The partitioning scheme specified with the ``pyarrow.dataset.partitioning()`` function or a list of field names. When providing a list of field names, you can use ``partitioning_flavor`` to drive which partitioning type should be used. - This option is only supported for use_legacy_dataset=False. basename_template : str, optional A template string used to generate basenames of written data files. The token '{i}' will be replaced with an automatically incremented integer. If not specified, it defaults to "guid-{i}.parquet". - This option is only supported for use_legacy_dataset=False. use_threads : bool, default True Write files in parallel. If enabled, then maximum parallelism will be used determined by the number of available CPU cores. - This option is only supported for use_legacy_dataset=False. file_visitor : function If set, this function will be called with a WrittenFile instance for each file created during the call. This object will have both @@ -3330,7 +2043,6 @@ def write_to_dataset(table, root_path, partition_cols=None, def file_visitor(written_file): visited_paths.append(written_file.path) - This option is only supported for use_legacy_dataset=False. existing_data_behavior : 'overwrite_or_ignore' | 'error' | \ 'delete_matching' Controls how the dataset will handle data that already exists in @@ -3348,15 +2060,12 @@ def file_visitor(written_file): dataset. The first time each partition directory is encountered the entire directory will be deleted. This allows you to overwrite old partitions completely. - This option is only supported for use_legacy_dataset=False. **kwargs : dict, - When use_legacy_dataset=False, used as additional kwargs for - `dataset.write_dataset` function for matching kwargs, and remainder to - `ParquetFileFormat.make_write_options`. See the docstring - of `write_table` and `dataset.write_dataset` for the available options. - When use_legacy_dataset=True, used as additional kwargs for - `parquet.write_table` function (See docstring for `write_table` - or `ParquetWriter` for more information). + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. Using `metadata_collector` in kwargs allows one to collect the file metadata instances of dataset pieces. The file paths in the ColumnChunkMetaData will be set relative to `root_path`. @@ -3376,194 +2085,79 @@ def file_visitor(written_file): >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_name_3', ... partition_cols=['year']) - >>> pq.ParquetDataset('dataset_name_3', use_legacy_dataset=False).files + >>> pq.ParquetDataset('dataset_name_3').files ['dataset_name_3/year=2019/...-0.parquet', ... 
Write a single Parquet file into the root folder: >>> pq.write_to_dataset(table, root_path='dataset_name_4') - >>> pq.ParquetDataset('dataset_name_4/', use_legacy_dataset=False).files + >>> pq.ParquetDataset('dataset_name_4/').files ['dataset_name_4/...-0.parquet'] """ - # Choose the implementation - if use_legacy_dataset is None: - # if partition_filename_cb is specified -> - # default to the old implementation - if partition_filename_cb: - use_legacy_dataset = True - # otherwise the default is False - else: - use_legacy_dataset = False + if use_legacy_dataset is not None: + warnings.warn( + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", + FutureWarning, stacklevel=2) + + metadata_collector = kwargs.pop('metadata_collector', None) # Check for conflicting keywords - msg_confl_0 = ( - "The '{0}' argument is not supported by use_legacy_dataset={2}. " - "Use only '{1}' instead." - ) - msg_confl_1 = ( - "The '{1}' argument is not supported by use_legacy_dataset={2}. " + msg_confl = ( + "The '{1}' argument is not supported. " "Use only '{0}' instead." ) - msg_confl = msg_confl_0 if use_legacy_dataset else msg_confl_1 - if partition_filename_cb is not None and basename_template is not None: - raise ValueError(msg_confl.format("basename_template", - "partition_filename_cb", - use_legacy_dataset)) - if partition_cols is not None and partitioning is not None: raise ValueError(msg_confl.format("partitioning", - "partition_cols", - use_legacy_dataset)) + "partition_cols")) - metadata_collector = kwargs.pop('metadata_collector', None) if metadata_collector is not None and file_visitor is not None: raise ValueError(msg_confl.format("file_visitor", - "metadata_collector", - use_legacy_dataset)) + "metadata_collector")) - # New dataset implementation - if not use_legacy_dataset: - import pyarrow.dataset as ds + import pyarrow.dataset as ds - # extract write_dataset specific options - # reset assumed to go to make_write_options - write_dataset_kwargs = dict() - for key in inspect.signature(ds.write_dataset).parameters: - if key in kwargs: - write_dataset_kwargs[key] = kwargs.pop(key) - write_dataset_kwargs['max_rows_per_group'] = kwargs.pop( - 'row_group_size', kwargs.pop("chunk_size", None) - ) - # raise for unsupported keywords - msg = ( - "The '{}' argument is not supported with the new dataset " - "implementation." 
- ) - if metadata_collector is not None: - def file_visitor(written_file): - metadata_collector.append(written_file.metadata) - if partition_filename_cb is not None: - raise ValueError(msg.format("partition_filename_cb")) + # extract write_dataset specific options + # reset assumed to go to make_write_options + write_dataset_kwargs = dict() + for key in inspect.signature(ds.write_dataset).parameters: + if key in kwargs: + write_dataset_kwargs[key] = kwargs.pop(key) + write_dataset_kwargs['max_rows_per_group'] = kwargs.pop( + 'row_group_size', kwargs.pop("chunk_size", None) + ) - # map format arguments - parquet_format = ds.ParquetFileFormat() - write_options = parquet_format.make_write_options(**kwargs) + if metadata_collector is not None: + def file_visitor(written_file): + metadata_collector.append(written_file.metadata) - # map old filesystems to new one - if filesystem is not None: - filesystem = _ensure_filesystem(filesystem) - - if partition_cols: - part_schema = table.select(partition_cols).schema - partitioning = ds.partitioning(part_schema, flavor="hive") - - if basename_template is None: - basename_template = guid() + '-{i}.parquet' - - if existing_data_behavior is None: - existing_data_behavior = 'overwrite_or_ignore' - - ds.write_dataset( - table, root_path, filesystem=filesystem, - format=parquet_format, file_options=write_options, schema=schema, - partitioning=partitioning, use_threads=use_threads, - file_visitor=file_visitor, - basename_template=basename_template, - existing_data_behavior=existing_data_behavior, - **write_dataset_kwargs) - return - - # warnings and errors when using legacy implementation - if use_legacy_dataset: - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 8.0.0, and the legacy implementation " - "will be removed in a future version.", - FutureWarning, stacklevel=2) - msg2 = ( - "The '{}' argument is not supported with the legacy " - "implementation. To use this argument specify " - "'use_legacy_dataset=False' while constructing the " - "ParquetDataset." - ) - if schema is not None: - raise ValueError(msg2.format("schema")) - if partitioning is not None: - raise ValueError(msg2.format("partitioning")) - if use_threads is not None: - raise ValueError(msg2.format("use_threads")) - if file_visitor is not None: - raise ValueError(msg2.format("file_visitor")) - if existing_data_behavior is not None: - raise ValueError(msg2.format("existing_data_behavior")) - if basename_template is not None: - raise ValueError(msg2.format("basename_template")) - if partition_filename_cb is not None: - warnings.warn( - _DEPR_MSG.format("partition_filename_cb", " Specify " - "'use_legacy_dataset=False' while constructing " - "the ParquetDataset, and then use the " - "'basename_template' parameter instead. 
For " - "usage see `pyarrow.dataset.write_dataset`"), - FutureWarning, stacklevel=2) + # map format arguments + parquet_format = ds.ParquetFileFormat() + write_options = parquet_format.make_write_options(**kwargs) - # Legacy implementation - fs, root_path = legacyfs.resolve_filesystem_and_path(root_path, filesystem) - - _mkdir_if_not_exists(fs, root_path) - - if partition_cols is not None and len(partition_cols) > 0: - df = table.to_pandas() - partition_keys = [df[col] for col in partition_cols] - data_df = df.drop(partition_cols, axis='columns') - data_cols = df.columns.drop(partition_cols) - if len(data_cols) == 0: - raise ValueError('No data left to save outside partition columns') - - subschema = table.schema - - # ARROW-2891: Ensure the output_schema is preserved when writing a - # partitioned dataset - for col in table.schema.names: - if col in partition_cols: - subschema = subschema.remove(subschema.get_field_index(col)) - - # ARROW-17829: avoid deprecation warnings for df.groupby - # https://github.com/pandas-dev/pandas/issues/42795 - if len(partition_keys) == 1: - partition_keys = partition_keys[0] - - for keys, subgroup in data_df.groupby(partition_keys, observed=True): - if not isinstance(keys, tuple): - keys = (keys,) - subdir = '/'.join( - ['{colname}={value}'.format(colname=name, value=val) - for name, val in zip(partition_cols, keys)]) - subtable = pa.Table.from_pandas(subgroup, schema=subschema, - safe=False) - _mkdir_if_not_exists(fs, '/'.join([root_path, subdir])) - if partition_filename_cb: - outfile = partition_filename_cb(keys) - else: - outfile = guid() + '.parquet' - relative_path = '/'.join([subdir, outfile]) - full_path = '/'.join([root_path, relative_path]) - with fs.open(full_path, 'wb') as f: - write_table(subtable, f, metadata_collector=metadata_collector, - **kwargs) - if metadata_collector is not None: - metadata_collector[-1].set_file_path(relative_path) - else: - if partition_filename_cb: - outfile = partition_filename_cb(None) - else: - outfile = guid() + '.parquet' - full_path = '/'.join([root_path, outfile]) - with fs.open(full_path, 'wb') as f: - write_table(table, f, metadata_collector=metadata_collector, - **kwargs) - if metadata_collector is not None: - metadata_collector[-1].set_file_path(outfile) + # map old filesystems to new one + if filesystem is not None: + filesystem = _ensure_filesystem(filesystem) + + if partition_cols: + part_schema = table.select(partition_cols).schema + partitioning = ds.partitioning(part_schema, flavor="hive") + + if basename_template is None: + basename_template = guid() + '-{i}.parquet' + + if existing_data_behavior is None: + existing_data_behavior = 'overwrite_or_ignore' + + ds.write_dataset( + table, root_path, filesystem=filesystem, + format=parquet_format, file_options=write_options, schema=schema, + partitioning=partitioning, use_threads=use_threads, + file_visitor=file_visitor, + basename_template=basename_template, + existing_data_behavior=existing_data_behavior, + **write_dataset_kwargs) + return def write_metadata(schema, where, metadata_collector=None, filesystem=None, @@ -3741,15 +2335,11 @@ def read_schema(where, memory_map=False, decryption_properties=None, "FileEncryptionProperties", "FileMetaData", "ParquetDataset", - "ParquetDatasetPiece", "ParquetFile", "ParquetLogicalType", - "ParquetManifest", - "ParquetPartitions", "ParquetReader", "ParquetSchema", "ParquetWriter", - "PartitionSet", "RowGroupMetaData", "SortingColumn", "Statistics", diff --git a/python/pyarrow/tests/parquet/__init__.py 
b/python/pyarrow/tests/parquet/__init__.py index 4c4e8240b8736..d08d67d2860f4 100644 --- a/python/pyarrow/tests/parquet/__init__.py +++ b/python/pyarrow/tests/parquet/__init__.py @@ -21,7 +21,4 @@ # Ignore these with pytest ... -m 'not parquet' pytestmark = [ pytest.mark.parquet, - pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning" - ), ] diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 4401d3ca6bb75..8365ed5b28543 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -18,31 +18,10 @@ import io import numpy as np -import pytest import pyarrow as pa from pyarrow.tests import util -legacy_filter_mark = pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy:FutureWarning" -) - -parametrize_legacy_dataset = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=legacy_filter_mark), - pytest.param(False, marks=pytest.mark.dataset)] -) -parametrize_legacy_dataset_not_supported = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=legacy_filter_mark), - pytest.param(False, marks=pytest.mark.skip)] -) -parametrize_legacy_dataset_fixed = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=[pytest.mark.xfail, legacy_filter_mark]), - pytest.param(False, marks=pytest.mark.dataset)] -) - def _write_table(table, path, **kwargs): # So we see the ImportError somewhere @@ -65,19 +44,18 @@ def _read_table(*args, **kwargs): def _roundtrip_table(table, read_table_kwargs=None, - write_table_kwargs=None, use_legacy_dataset=False): + write_table_kwargs=None): read_table_kwargs = read_table_kwargs or {} write_table_kwargs = write_table_kwargs or {} writer = pa.BufferOutputStream() _write_table(table, writer, **write_table_kwargs) reader = pa.BufferReader(writer.getvalue()) - return _read_table(reader, use_legacy_dataset=use_legacy_dataset, - **read_table_kwargs) + return _read_table(reader, **read_table_kwargs) def _check_roundtrip(table, expected=None, read_table_kwargs=None, - use_legacy_dataset=False, **write_table_kwargs): + **write_table_kwargs): if expected is None: expected = table @@ -85,20 +63,17 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, # intentionally check twice result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs, - use_legacy_dataset=use_legacy_dataset) + write_table_kwargs=write_table_kwargs) assert result.equals(expected) result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs, - use_legacy_dataset=use_legacy_dataset) + write_table_kwargs=write_table_kwargs) assert result.equals(expected) -def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=False): +def _roundtrip_pandas_dataframe(df, write_kwargs): table = pa.Table.from_pandas(df) result = _roundtrip_table( - table, write_table_kwargs=write_kwargs, - use_legacy_dataset=use_legacy_dataset) + table, write_table_kwargs=write_kwargs) return result.to_pandas() diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 83e6ebeb7a1fc..3c867776ac052 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -28,7 +28,6 @@ from pyarrow.filesystem import LocalFileSystem, FileSystem from pyarrow.tests import util from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table, - 
parametrize_legacy_dataset, _test_dataframe) try: @@ -63,21 +62,18 @@ def test_parquet_invalid_version(tempdir): data_page_version="2.2") -@parametrize_legacy_dataset -def test_set_data_page_size(use_legacy_dataset): +def test_set_data_page_size(): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) # 128K, 512K page_sizes = [2 << 16, 2 << 18] for target_page_size in page_sizes: - _check_roundtrip(t, data_page_size=target_page_size, - use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(t, data_page_size=target_page_size) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_set_write_batch_size(use_legacy_dataset): +def test_set_write_batch_size(): df = _test_dataframe(100) table = pa.Table.from_pandas(df, preserve_index=False) @@ -87,8 +83,7 @@ def test_set_write_batch_size(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_set_dictionary_pagesize_limit(use_legacy_dataset): +def test_set_dictionary_pagesize_limit(): df = _test_dataframe(100) table = pa.Table.from_pandas(df, preserve_index=False) @@ -101,8 +96,7 @@ def test_set_dictionary_pagesize_limit(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_chunked_table_write(use_legacy_dataset): +def test_chunked_table_write(): # ARROW-232 tables = [] batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10)) @@ -116,66 +110,56 @@ def test_chunked_table_write(use_legacy_dataset): for table in tables: _check_roundtrip( table, version='2.6', - use_legacy_dataset=use_legacy_dataset, data_page_version=data_page_version, use_dictionary=use_dictionary) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_memory_map(tempdir, use_legacy_dataset): +def test_memory_map(tempdir): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'memory_map': True}, - version='2.6', use_legacy_dataset=use_legacy_dataset) + version='2.6') filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.6') - table_read = pq.read_pandas(filename, memory_map=True, - use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename, memory_map=True) assert table_read.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_enable_buffered_stream(tempdir, use_legacy_dataset): +def test_enable_buffered_stream(tempdir): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025}, - version='2.6', use_legacy_dataset=use_legacy_dataset) + version='2.6') filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.6') - table_read = pq.read_pandas(filename, buffer_size=4096, - use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename, buffer_size=4096) assert table_read.equals(table) -@parametrize_legacy_dataset -def test_special_chars_filename(tempdir, use_legacy_dataset): +def test_special_chars_filename(tempdir): table = pa.Table.from_arrays([pa.array([42])], ["ints"]) filename = "foo # bar" path = tempdir / filename assert not path.exists() _write_table(table, str(path)) assert path.exists() - table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(str(path)) assert table_read.equals(table) -@parametrize_legacy_dataset -def test_invalid_source(use_legacy_dataset): +def test_invalid_source(): # Test that we provide an helpful error message pointing out # that None wasn't expected 
when trying to open a Parquet None file. - # - # Depending on use_legacy_dataset the message changes slightly - # but in both cases it should point out that None wasn't expected. with pytest.raises(TypeError, match="None"): - pq.read_table(None, use_legacy_dataset=use_legacy_dataset) + pq.read_table(None) with pytest.raises(TypeError, match="None"): pq.ParquetFile(None) @@ -193,8 +177,7 @@ def test_file_with_over_int16_max_row_groups(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_empty_table_roundtrip(use_legacy_dataset): +def test_empty_table_roundtrip(): df = alltypes_sample(size=10) # Create a non-empty table to infer the types correctly, then slice to 0 @@ -206,19 +189,17 @@ def test_empty_table_roundtrip(use_legacy_dataset): assert table.schema.field('null').type == pa.null() assert table.schema.field('null_list').type == pa.list_(pa.null()) _check_roundtrip( - table, version='2.6', use_legacy_dataset=use_legacy_dataset) + table, version='2.6') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_empty_table_no_columns(use_legacy_dataset): +def test_empty_table_no_columns(): df = pd.DataFrame() empty = pa.Table.from_pandas(df, preserve_index=False) - _check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(empty) -@parametrize_legacy_dataset -def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): +def test_write_nested_zero_length_array_chunk_failure(): # Bug report in ARROW-3792 cols = OrderedDict( int32=pa.int32(), @@ -243,17 +224,16 @@ def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols)) for batch in my_arrays] tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) - _check_roundtrip(tbl, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(tbl) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multiple_path_types(tempdir, use_legacy_dataset): +def test_multiple_path_types(tempdir): # Test compatibility with PEP 519 path-like objects path = tempdir / 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(path) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -261,13 +241,12 @@ def test_multiple_path_types(tempdir, use_legacy_dataset): path = str(tempdir) + 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(path) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) -@parametrize_legacy_dataset -def test_fspath(tempdir, use_legacy_dataset): +def test_fspath(tempdir): # ARROW-12472 support __fspath__ objects without using str() path = tempdir / "test.parquet" table = pa.table({"a": [1, 2, 3]}) @@ -275,9 +254,7 @@ def test_fspath(tempdir, use_legacy_dataset): fs_protocol_obj = util.FSProtocolClass(path) - result = _read_table( - fs_protocol_obj, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(fs_protocol_obj) assert result.equals(table) # combined with non-local filesystem raises @@ -285,15 +262,11 @@ def test_fspath(tempdir, use_legacy_dataset): _read_table(fs_protocol_obj, filesystem=FileSystem()) -@pytest.mark.dataset -@parametrize_legacy_dataset @pytest.mark.parametrize("filesystem", [ None, fs.LocalFileSystem(), LocalFileSystem._get_instance() ]) @pytest.mark.parametrize("name", 
("data.parquet", "例.parquet")) -def test_relative_paths(tempdir, use_legacy_dataset, filesystem, name): - if use_legacy_dataset and isinstance(filesystem, fs.FileSystem): - pytest.skip("Passing new filesystem not supported for legacy reader") +def test_relative_paths(tempdir, filesystem, name): # reading and writing from relative paths table = pa.table({"a": [1, 2, 3]}) path = tempdir / name @@ -301,8 +274,7 @@ def test_relative_paths(tempdir, use_legacy_dataset, filesystem, name): # reading pq.write_table(table, str(path)) with util.change_cwd(tempdir): - result = pq.read_table(name, filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(name, filesystem=filesystem) assert result.equals(table) path.unlink() @@ -334,24 +306,21 @@ def seek(self, *args): pq.read_table(BogusFile(b"")) -@parametrize_legacy_dataset -def test_parquet_read_from_buffer(tempdir, use_legacy_dataset): +def test_parquet_read_from_buffer(tempdir): # reading from a buffer from python's open() table = pa.table({"a": [1, 2, 3]}) pq.write_table(table, str(tempdir / "data.parquet")) with open(str(tempdir / "data.parquet"), "rb") as f: - result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(f) assert result.equals(table) with open(str(tempdir / "data.parquet"), "rb") as f: - result = pq.read_table(pa.PythonFile(f), - use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(pa.PythonFile(f)) assert result.equals(table) -@parametrize_legacy_dataset -def test_byte_stream_split(use_legacy_dataset): +def test_byte_stream_split(): # This is only a smoke test. arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) @@ -385,12 +354,10 @@ def test_byte_stream_split(use_legacy_dataset): table = pa.Table.from_arrays([arr_int], names=['tmp']) with pytest.raises(IOError): _check_roundtrip(table, expected=table, use_byte_stream_split=True, - use_dictionary=False, - use_legacy_dataset=use_legacy_dataset) + use_dictionary=False) -@parametrize_legacy_dataset -def test_column_encoding(use_legacy_dataset): +def test_column_encoding(): arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary()) @@ -406,30 +373,26 @@ def test_column_encoding(use_legacy_dataset): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Check "PLAIN" for all columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding="PLAIN", - use_legacy_dataset=use_legacy_dataset) + column_encoding="PLAIN") # Check "DELTA_BINARY_PACKED" for integer columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Check "DELTA_LENGTH_BYTE_ARRAY" for byte columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "DELTA_LENGTH_BYTE_ARRAY"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "DELTA_LENGTH_BYTE_ARRAY"}) # Check "DELTA_BYTE_ARRAY" for byte columns. 
_check_roundtrip(mixed_table, expected=mixed_table, @@ -437,14 +400,12 @@ def test_column_encoding(use_legacy_dataset): column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", 'c': "DELTA_BYTE_ARRAY", - 'd': "DELTA_BYTE_ARRAY"}, - use_legacy_dataset=use_legacy_dataset) + 'd': "DELTA_BYTE_ARRAY"}) # Check "RLE" for boolean columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding={'e': "RLE"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'e': "RLE"}) # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. # This should throw an error as it is only supports FLOAT and DOUBLE. @@ -455,8 +416,7 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass use "DELTA_BINARY_PACKED" encoding on float column. # This should throw an error as only integers are supported. @@ -465,8 +425,7 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "DELTA_BINARY_PACKED", 'b': "PLAIN", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass "RLE_DICTIONARY". # This should throw an error as dictionary encoding is already used by @@ -474,30 +433,26 @@ def test_column_encoding(use_legacy_dataset): with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding="RLE_DICTIONARY", - use_legacy_dataset=use_legacy_dataset) + column_encoding="RLE_DICTIONARY") # Try to pass unsupported encoding. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding={'a': "MADE_UP_ENCODING"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'a': "MADE_UP_ENCODING"}) # Try to pass column_encoding and use_dictionary. # This should throw an error. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=['b'], - column_encoding={'b': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'b': "PLAIN"}) # Try to pass column_encoding and use_dictionary=True (default value). # This should throw an error. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, - column_encoding={'b': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'b': "PLAIN"}) # Try to pass column_encoding and use_byte_stream_split on same column. # This should throw an error. @@ -507,8 +462,7 @@ def test_column_encoding(use_legacy_dataset): use_byte_stream_split=['a'], column_encoding={'a': "RLE", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass column_encoding and use_byte_stream_split=True. # This should throw an error. @@ -518,54 +472,45 @@ def test_column_encoding(use_legacy_dataset): use_byte_stream_split=True, column_encoding={'a': "RLE", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass column_encoding=True. # This should throw an error. 
with pytest.raises(TypeError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding=True, - use_legacy_dataset=use_legacy_dataset) + column_encoding=True) -@parametrize_legacy_dataset -def test_compression_level(use_legacy_dataset): +def test_compression_level(): arr = pa.array(list(map(int, range(1000)))) data = [arr, arr] table = pa.Table.from_arrays(data, names=['a', 'b']) # Check one compression level. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=1, - use_legacy_dataset=use_legacy_dataset) + compression_level=1) # Check another one to make sure that compression_level=1 does not # coincide with the default one in Arrow. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=5, - use_legacy_dataset=use_legacy_dataset) + compression_level=5) # Check that the user can provide a compression per column _check_roundtrip(table, expected=table, - compression={'a': "gzip", 'b': "snappy"}, - use_legacy_dataset=use_legacy_dataset) + compression={'a': "gzip", 'b': "snappy"}) # Check that the user can provide a compression level per column _check_roundtrip(table, expected=table, compression="gzip", - compression_level={'a': 2, 'b': 3}, - use_legacy_dataset=use_legacy_dataset) + compression_level={'a': 2, 'b': 3}) # Check if both LZ4 compressors are working # (level < 3 -> fast, level >= 3 -> HC) _check_roundtrip(table, expected=table, compression="lz4", - compression_level=1, - use_legacy_dataset=use_legacy_dataset) + compression_level=1) _check_roundtrip(table, expected=table, compression="lz4", - compression_level=9, - use_legacy_dataset=use_legacy_dataset) + compression_level=9) # Check that specifying a compression level for a codec which does allow # specifying one, results into an error. 
@@ -594,8 +539,7 @@ def test_sanitized_spark_field_names(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multithreaded_read(use_legacy_dataset): +def test_multithreaded_read(): df = alltypes_sample(size=10000) table = pa.Table.from_pandas(df) @@ -604,19 +548,16 @@ def test_multithreaded_read(use_legacy_dataset): _write_table(table, buf, compression='SNAPPY', version='2.6') buf.seek(0) - table1 = _read_table( - buf, use_threads=True, use_legacy_dataset=use_legacy_dataset) + table1 = _read_table(buf, use_threads=True) buf.seek(0) - table2 = _read_table( - buf, use_threads=False, use_legacy_dataset=use_legacy_dataset) + table2 = _read_table(buf, use_threads=False) assert table1.equals(table2) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_min_chunksize(use_legacy_dataset): +def test_min_chunksize(): data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D']) table = pa.Table.from_pandas(data.reset_index()) @@ -624,7 +565,7 @@ def test_min_chunksize(use_legacy_dataset): _write_table(table, buf, chunk_size=-1) buf.seek(0) - result = _read_table(buf, use_legacy_dataset=use_legacy_dataset) + result = _read_table(buf) assert result.equals(table) @@ -659,57 +600,46 @@ def test_write_error_deletes_incomplete_file(tempdir): assert not filename.exists() -@parametrize_legacy_dataset -def test_read_non_existent_file(tempdir, use_legacy_dataset): +def test_read_non_existent_file(tempdir): path = 'nonexistent-file.parquet' try: - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) except Exception as e: assert path in e.args[0] -@parametrize_legacy_dataset -def test_read_table_doesnt_warn(datadir, use_legacy_dataset): - if use_legacy_dataset: - msg = "Passing 'use_legacy_dataset=True'" - with pytest.warns(FutureWarning, match=msg): - pq.read_table(datadir / 'v0.7.1.parquet', - use_legacy_dataset=use_legacy_dataset) - else: - with warnings.catch_warnings(): - warnings.simplefilter(action="error") - pq.read_table(datadir / 'v0.7.1.parquet', - use_legacy_dataset=use_legacy_dataset) +def test_read_table_doesnt_warn(datadir): + with warnings.catch_warnings(): + warnings.simplefilter(action="error") + pq.read_table(datadir / 'v0.7.1.parquet') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_zlib_compression_bug(use_legacy_dataset): +def test_zlib_compression_bug(): # ARROW-3514: "zlib deflate failed, output buffer too small" table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col']) f = io.BytesIO() pq.write_table(table, f, compression='gzip') f.seek(0) - roundtrip = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) + roundtrip = pq.read_table(f) tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas()) -@parametrize_legacy_dataset -def test_parquet_file_too_small(tempdir, use_legacy_dataset): +def test_parquet_file_too_small(tempdir): path = str(tempdir / "test.parquet") # TODO(dataset) with datasets API it raises OSError instead with pytest.raises((pa.ArrowInvalid, OSError), match='size is 0 bytes'): with open(path, 'wb') as f: pass - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) with pytest.raises((pa.ArrowInvalid, OSError), match='size is 4 bytes'): with open(path, 'wb') as f: f.write(b'ffff') - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) @pytest.mark.pandas @@ -752,17 +682,15 @@ def test_fastparquet_cross_compatibility(tempdir): tm.assert_frame_equal(table_fp.to_pandas(), df) -@parametrize_legacy_dataset 
@pytest.mark.parametrize('array_factory', [ lambda: pa.array([0, None] * 10), lambda: pa.array([0, None] * 10).dictionary_encode(), lambda: pa.array(["", None] * 10), lambda: pa.array(["", None] * 10).dictionary_encode(), ]) -@pytest.mark.parametrize('use_dictionary', [False, True]) @pytest.mark.parametrize('read_dictionary', [False, True]) def test_buffer_contents( - array_factory, use_dictionary, read_dictionary, use_legacy_dataset + array_factory, read_dictionary ): # Test that null values are deterministically initialized to zero # after a roundtrip through Parquet. @@ -773,8 +701,7 @@ def test_buffer_contents( bio.seek(0) read_dictionary = ['col'] if read_dictionary else None table = pq.read_table(bio, use_threads=False, - read_dictionary=read_dictionary, - use_legacy_dataset=use_legacy_dataset) + read_dictionary=read_dictionary) for col in table.columns: [chunk] = col.chunks @@ -826,7 +753,6 @@ def test_reads_over_batch(tempdir): assert table == table2 -@pytest.mark.dataset def test_permutation_of_column_order(tempdir): # ARROW-2366 case = tempdir / "dataset_column_order_permutation" @@ -846,18 +772,6 @@ def test_permutation_of_column_order(tempdir): assert table == table2 -def test_read_table_legacy_deprecated(tempdir): - # ARROW-15870 - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - pq.write_table(table, path) - - with pytest.warns( - FutureWarning, match="Passing 'use_legacy_dataset=True'" - ): - pq.read_table(path, use_legacy_dataset=True) - - def test_thrift_size_limits(tempdir): path = tempdir / 'largethrift.parquet' @@ -942,28 +856,9 @@ def test_page_checksum_verification_write_table(tempdir): with pytest.raises(OSError, match="CRC checksum verification"): _ = corrupted_pq_file.read() - # Case 5: Check that enabling page checksum verification in combination - # with legacy dataset raises an exception - with pytest.raises(ValueError, match="page_checksum_verification"): - _ = pq.read_table(corrupted_path, - page_checksum_verification=True, - use_legacy_dataset=True) - @pytest.mark.dataset -@pytest.mark.parametrize( - "use_legacy_dataset", - [ - False, - pytest.param( - True, - marks=pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning" - ), - ), - ], -) -def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): +def test_checksum_write_to_dataset(tempdir): """Check that checksum verification works for datasets created with pq.write_to_dataset""" @@ -973,8 +868,7 @@ def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): original_dir_path = tempdir / 'correct_dir' pq.write_to_dataset(table_orig, original_dir_path, - write_page_checksum=True, - use_legacy_dataset=use_legacy_dataset) + write_page_checksum=True) # Read file and verify that the data is correct original_file_path_list = list(original_dir_path.iterdir()) @@ -1014,3 +908,23 @@ def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): # checksum verification enabled raises an exception with pytest.raises(OSError, match="CRC checksum verification"): _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) + + +@pytest.mark.dataset +def test_deprecated_use_legacy_dataset(tempdir): + # Test that specifying use_legacy_dataset in ParquetDataset, write_to_dataset + # and read_table doesn't raise an error but gives a warning. 
+ table = pa.table({"a": [1, 2, 3]}) + path = tempdir / "deprecate_legacy" + + msg = "Passing 'use_legacy_dataset'" + with pytest.warns(FutureWarning, match=msg): + pq.write_to_dataset(table, path, use_legacy_dataset=False) + + pq.write_to_dataset(table, path) + + with pytest.warns(FutureWarning, match=msg): + pq.read_table(path, use_legacy_dataset=False) + + with pytest.warns(FutureWarning, match=msg): + pq.ParquetDataset(path, use_legacy_dataset=False) diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index ca1ad7ee32255..2345855a3321b 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -18,7 +18,6 @@ import pytest import pyarrow as pa -from pyarrow.tests.parquet.common import parametrize_legacy_dataset try: import pyarrow.parquet as pq @@ -58,16 +57,13 @@ @pytest.mark.pandas -@parametrize_legacy_dataset @parametrize_test_data -def test_write_compliant_nested_type_enable(tempdir, - use_legacy_dataset, test_data): +def test_write_compliant_nested_type_enable(tempdir, test_data): # prepare dataframe for testing df = pd.DataFrame(data=test_data) # verify that we can read/write pandas df with new flag (default behaviour) _roundtrip_pandas_dataframe(df, - write_kwargs={}, - use_legacy_dataset=use_legacy_dataset) + write_kwargs={}) # Write to a parquet file with compliant nested type table = pa.Table.from_pandas(df, preserve_index=False) @@ -83,21 +79,17 @@ def test_write_compliant_nested_type_enable(tempdir, assert new_table.schema.types[0].value_field.name == 'element' # Verify that the new table can be read/written correctly - _check_roundtrip(new_table, - use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(new_table) @pytest.mark.pandas -@parametrize_legacy_dataset @parametrize_test_data -def test_write_compliant_nested_type_disable(tempdir, - use_legacy_dataset, test_data): +def test_write_compliant_nested_type_disable(tempdir, test_data): # prepare dataframe for testing df = pd.DataFrame(data=test_data) # verify that we can read/write with new flag disabled _roundtrip_pandas_dataframe(df, write_kwargs={ - 'use_compliant_nested_type': False}, - use_legacy_dataset=use_legacy_dataset) + 'use_compliant_nested_type': False}) # Write to a parquet file while disabling compliant nested type table = pa.Table.from_pandas(df, preserve_index=False) @@ -114,5 +106,4 @@ def test_write_compliant_nested_type_disable(tempdir, # Verify that the new table can be read/written correctly _check_roundtrip(new_table, - use_legacy_dataset=use_legacy_dataset, use_compliant_nested_type=False) diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 32fe128bbae9b..e6b66b00428fb 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -23,8 +23,7 @@ import pyarrow as pa from pyarrow.tests import util -from pyarrow.tests.parquet.common import (_check_roundtrip, - parametrize_legacy_dataset) +from pyarrow.tests.parquet.common import _check_roundtrip try: import pyarrow.parquet as pq @@ -54,9 +53,8 @@ @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): +def test_parquet_2_0_roundtrip(tempdir, chunk_size): df = alltypes_sample(size=10000, categorical=True) filename = tempdir / 'pandas_roundtrip.parquet' @@ 
-65,8 +63,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): _write_table(arrow_table, filename, version='2.6', chunk_size=chunk_size) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) assert table_read.schema.pandas_metadata is not None read_metadata = table_read.schema.metadata @@ -77,8 +74,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): +def test_parquet_1_0_roundtrip(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -100,7 +96,7 @@ def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, version='1.0') - table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 @@ -113,18 +109,17 @@ def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): # ----------------------------------------------------------------------------- -def _simple_table_write_read(table, use_legacy_dataset): +def _simple_table_write_read(table): bio = pa.BufferOutputStream() pq.write_table(table, bio) contents = bio.getvalue() return pq.read_table( - pa.BufferReader(contents), use_legacy_dataset=use_legacy_dataset + pa.BufferReader(contents) ) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_direct_read_dictionary(use_legacy_dataset): +def test_direct_read_dictionary(): # ARROW-3325 repeats = 10 nunique = 5 @@ -140,8 +135,7 @@ def test_direct_read_dictionary(use_legacy_dataset): contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0'], - use_legacy_dataset=use_legacy_dataset) + read_dictionary=['f0']) # Compute dictionary-encoded subfield expected = pa.table([table[0].dictionary_encode()], names=['f0']) @@ -149,8 +143,7 @@ def test_direct_read_dictionary(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_direct_read_dictionary_subfield(use_legacy_dataset): +def test_direct_read_dictionary_subfield(): repeats = 10 nunique = 5 @@ -163,8 +156,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset): pq.write_table(table, bio) contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0.list.element'], - use_legacy_dataset=use_legacy_dataset) + read_dictionary=['f0.list.element']) arr = pa.array(data[0]) values_as_dict = arr.values.dictionary_encode() @@ -181,8 +173,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset): assert result[0].num_chunks == 1 -@parametrize_legacy_dataset -def test_dictionary_array_automatically_read(use_legacy_dataset): +def test_dictionary_array_automatically_read(): # ARROW-3246 # Make a large dictionary, a little over 4MB of data @@ -200,7 +191,7 @@ def test_dictionary_array_automatically_read(use_legacy_dataset): dict_values)) table = pa.table([pa.chunked_array(chunks)], names=['f0']) - result = _simple_table_write_read(table, use_legacy_dataset) + result = _simple_table_write_read(table) assert result.equals(table) @@ -213,8 +204,7 @@ def test_dictionary_array_automatically_read(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_decimal_roundtrip(tempdir, use_legacy_dataset): +def 
test_decimal_roundtrip(tempdir):
     num_values = 10
 
     columns = {}
@@ -234,8 +224,7 @@ def test_decimal_roundtrip(tempdir, use_legacy_dataset):
     string_filename = str(filename)
     table = pa.Table.from_pandas(expected)
     _write_table(table, string_filename)
-    result_table = _read_table(
-        string_filename, use_legacy_dataset=use_legacy_dataset)
+    result_table = _read_table(string_filename)
     result = result_table.to_pandas()
     tm.assert_frame_equal(result, expected)
 
@@ -259,14 +248,13 @@ def test_decimal_roundtrip_negative_scale(tempdir):
 # -----------------------------------------------------------------------------
 
 
-@parametrize_legacy_dataset
 @pytest.mark.parametrize('dtype', [int, float])
-def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
+def test_single_pylist_column_roundtrip(tempdir, dtype):
     filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__)
     data = [pa.array(list(map(dtype, range(5))))]
     table = pa.Table.from_arrays(data, names=['a'])
     _write_table(table, filename)
-    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
+    table_read = _read_table(filename)
     for i in range(table.num_columns):
         col_written = table[i]
         col_read = table_read[i]
@@ -277,16 +265,14 @@ def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
         assert data_written.equals(data_read)
 
 
-@parametrize_legacy_dataset
-def test_empty_lists_table_roundtrip(use_legacy_dataset):
+def test_empty_lists_table_roundtrip():
     # ARROW-2744: Shouldn't crash when writing an array of empty lists
     arr = pa.array([[], []], type=pa.list_(pa.int32()))
     table = pa.Table.from_arrays([arr], ["A"])
-    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
+    _check_roundtrip(table)
 
 
-@parametrize_legacy_dataset
-def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset):
+def test_nested_list_nonnullable_roundtrip_bug():
     # Reproduce failure in ARROW-5630
     typ = pa.list_(pa.field("item", pa.float32(), False))
     num_rows = 10000
@@ -295,26 +281,22 @@ def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset):
                                  (num_rows // 10)), type=typ)
     ], ['a'])
     _check_roundtrip(
-        t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset)
+        t, data_page_size=4096)
 
 
-@parametrize_legacy_dataset
-def test_nested_list_struct_multiple_batches_roundtrip(
-    tempdir, use_legacy_dataset
-):
+def test_nested_list_struct_multiple_batches_roundtrip(tempdir):
     # Reproduce failure in ARROW-11024
     data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100
     table = pa.table([pa.array(data)], names=['column'])
     _check_roundtrip(
-        table, row_group_size=20, use_legacy_dataset=use_legacy_dataset)
+        table, row_group_size=20)
 
     # Reproduce failure in ARROW-11069 (plain non-nested structs with strings)
     data = pa.array(
         [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10
     )
     table = pa.table({'column': data})
-    _check_roundtrip(
-        table, row_group_size=10, use_legacy_dataset=use_legacy_dataset)
+    _check_roundtrip(table, row_group_size=10)
 
 
 def test_writing_empty_lists():
@@ -366,8 +348,7 @@ def test_large_list_records():
 
 
 @pytest.mark.pandas
-@parametrize_legacy_dataset
-def test_parquet_nested_convenience(tempdir, use_legacy_dataset):
+def test_parquet_nested_convenience(tempdir):
     # ARROW-1684
     df = pd.DataFrame({
         'a': [[1, 2, 3], None, [4, 5], []],
@@ -380,11 +361,11 @@ def test_parquet_nested_convenience(tempdir, use_legacy_dataset):
     _write_table(table, path)
 
     read = pq.read_table(
-        path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
+        path, columns=['a'])
tm.assert_frame_equal(read.to_pandas(), df[['a']]) read = pq.read_table( - path, columns=['a', 'b'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a', 'b']) tm.assert_frame_equal(read.to_pandas(), df) @@ -420,17 +401,16 @@ def test_large_table_int32_overflow(): _write_table(table, f) -def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs): +def _simple_table_roundtrip(table, **write_kwargs): stream = pa.BufferOutputStream() _write_table(table, stream, **write_kwargs) buf = stream.getvalue() - return _read_table(buf, use_legacy_dataset=use_legacy_dataset) + return _read_table(buf) @pytest.mark.slow @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_byte_array_exactly_2gb(use_legacy_dataset): +def test_byte_array_exactly_2gb(): # Test edge case reported in ARROW-3762 val = b'x' * (1 << 10) @@ -444,15 +424,14 @@ def test_byte_array_exactly_2gb(use_legacy_dataset): values = pa.chunked_array([base, pa.array(case)]) t = pa.table([values], names=['f0']) result = _simple_table_roundtrip( - t, use_legacy_dataset=use_legacy_dataset, use_dictionary=False) + t, use_dictionary=False) assert t.equals(result) @pytest.mark.slow @pytest.mark.pandas @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_binary_array_overflow_to_chunked(use_legacy_dataset): +def test_binary_array_overflow_to_chunked(): # ARROW-3762 # 2^31 + 1 bytes @@ -462,8 +441,7 @@ def test_binary_array_overflow_to_chunked(use_legacy_dataset): df = pd.DataFrame({'byte_col': values}) tbl = pa.Table.from_pandas(df, preserve_index=False) - read_tbl = _simple_table_roundtrip( - tbl, use_legacy_dataset=use_legacy_dataset) + read_tbl = _simple_table_roundtrip(tbl) col0_data = read_tbl[0] assert isinstance(col0_data, pa.ChunkedArray) @@ -477,8 +455,7 @@ def test_binary_array_overflow_to_chunked(use_legacy_dataset): @pytest.mark.slow @pytest.mark.pandas @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_list_of_binary_large_cell(use_legacy_dataset): +def test_list_of_binary_large_cell(): # ARROW-4688 data = [] @@ -491,8 +468,7 @@ def test_list_of_binary_large_cell(use_legacy_dataset): arr = pa.array(data) table = pa.Table.from_arrays([arr], ['chunky_cells']) - read_table = _simple_table_roundtrip( - table, use_legacy_dataset=use_legacy_dataset) + read_table = _simple_table_roundtrip(table) assert table.equals(read_table) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index a9e99d5d65cf9..b6e351bdef9a7 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -29,9 +29,6 @@ from pyarrow import fs from pyarrow.filesystem import LocalFileSystem from pyarrow.tests import util -from pyarrow.tests.parquet.common import ( - parametrize_legacy_dataset, parametrize_legacy_dataset_fixed, - parametrize_legacy_dataset_not_supported) from pyarrow.util import guid from pyarrow.vendored.version import Version @@ -53,76 +50,10 @@ # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not parquet' -pytestmark = pytest.mark.parquet +pytestmark = [pytest.mark.parquet, pytest.mark.dataset] -@pytest.mark.pandas -def test_parquet_piece_read(tempdir): - df = _test_dataframe(1000) - table = pa.Table.from_pandas(df) - - path = tempdir / 'parquet_piece_read.parquet' - _write_table(table, path, version='2.6') - - with pytest.warns(FutureWarning): - piece1 = pq.ParquetDatasetPiece(path) - - result = piece1.read() - assert result.equals(table) - - -@pytest.mark.pandas -def test_parquet_piece_open_and_get_metadata(tempdir): - df = _test_dataframe(100) - table = pa.Table.from_pandas(df) - - path = tempdir / 'parquet_piece_read.parquet' - _write_table(table, path, version='2.6') - - with pytest.warns(FutureWarning): - piece = pq.ParquetDatasetPiece(path) - - table1 = piece.read() - assert isinstance(table1, pa.Table) - meta1 = piece.get_metadata() - assert isinstance(meta1, pq.FileMetaData) - - assert table.equals(table1) - - -@pytest.mark.filterwarnings("ignore:ParquetDatasetPiece:FutureWarning") -def test_parquet_piece_basics(): - path = '/baz.parq' - - piece1 = pq.ParquetDatasetPiece(path) - piece2 = pq.ParquetDatasetPiece(path, row_group=1) - piece3 = pq.ParquetDatasetPiece( - path, row_group=1, partition_keys=[('foo', 0), ('bar', 1)]) - - assert str(piece1) == path - assert str(piece2) == '/baz.parq | row_group=1' - assert str(piece3) == 'partition[foo=0, bar=1] /baz.parq | row_group=1' - - assert piece1 == piece1 - assert piece2 == piece2 - assert piece3 == piece3 - assert piece1 != piece3 - - -def test_partition_set_dictionary_type(): - set1 = pq.PartitionSet('key1', ['foo', 'bar', 'baz']) - set2 = pq.PartitionSet('key2', [2007, 2008, 2009]) - - assert isinstance(set1.dictionary, pa.StringArray) - assert isinstance(set2.dictionary, pa.IntegerArray) - - set3 = pq.PartitionSet('key2', [datetime.datetime(2007, 1, 1)]) - with pytest.raises(TypeError): - set3.dictionary - - -@parametrize_legacy_dataset_fixed -def test_filesystem_uri(tempdir, use_legacy_dataset): +def test_filesystem_uri(tempdir): table = pa.table({"a": [1, 2, 3]}) directory = tempdir / "data_dir" @@ -132,72 +63,36 @@ def test_filesystem_uri(tempdir, use_legacy_dataset): # filesystem object result = pq.read_table( - path, filesystem=fs.LocalFileSystem(), - use_legacy_dataset=use_legacy_dataset) + path, filesystem=fs.LocalFileSystem()) assert result.equals(table) # filesystem URI result = pq.read_table( - "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir), - use_legacy_dataset=use_legacy_dataset) + "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir)) assert result.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_partitioned_directory(tempdir, use_legacy_dataset): +def test_read_partitioned_directory(tempdir): fs = LocalFileSystem._get_instance() - _partition_test_for_filesystem(fs, tempdir, use_legacy_dataset) + _partition_test_for_filesystem(fs, tempdir) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") @pytest.mark.pandas -def test_create_parquet_dataset_multi_threaded(tempdir): - fs = LocalFileSystem._get_instance() - base_path = tempdir - - _partition_test_for_filesystem(fs, base_path) - - manifest = pq.ParquetManifest(base_path, filesystem=fs, - metadata_nthreads=1) - with pytest.warns( - FutureWarning, match="Specifying the 'metadata_nthreads'" - ): - dataset = pq.ParquetDataset( - base_path, filesystem=fs, metadata_nthreads=16, - use_legacy_dataset=True - ) - assert len(dataset.pieces) > 0 - partitions = dataset.partitions - assert 
len(partitions.partition_names) > 0 - assert partitions.partition_names == manifest.partitions.partition_names - assert len(partitions.levels) == len(manifest.partitions.levels) - - -@pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset): +def test_read_partitioned_columns_selection(tempdir): # ARROW-3861 - do not include partition columns in resulting table when # `columns` keyword was passed without those columns fs = LocalFileSystem._get_instance() base_path = tempdir _partition_test_for_filesystem(fs, base_path) - dataset = pq.ParquetDataset( - base_path, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path) result = dataset.read(columns=["values"]) - if use_legacy_dataset: - # ParquetDataset implementation always includes the partition columns - # automatically, and we can't easily "fix" this since dask relies on - # this behaviour (ARROW-8644) - assert result.column_names == ["values", "foo", "bar"] - else: - assert result.column_names == ["values"] + assert result.column_names == ["values"] @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_equivalency(tempdir, use_legacy_dataset): +def test_filters_equivalency(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -225,7 +120,6 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): base_path, filesystem=fs, filters=[('integer', '=', 1), ('string', '!=', 'b'), ('boolean', '==', 'True')], - use_legacy_dataset=use_legacy_dataset, ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -247,8 +141,7 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): [('integer', '=', 0), ('boolean', '==', 'False')] ] dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, - use_legacy_dataset=use_legacy_dataset) + base_path, filesystem=fs, filters=filters) table = dataset.read() result_df = table.to_pandas().reset_index(drop=True) @@ -262,30 +155,15 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): assert df_filter_2.sum() > 0 assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum()) - if use_legacy_dataset: - # Check for \0 in predicate values. Until they are correctly - # implemented in ARROW-3391, they would otherwise lead to weird - # results with the current code. 
- with pytest.raises(NotImplementedError): - filters = [[('string', '==', b'1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters, - use_legacy_dataset=True) - with pytest.raises(NotImplementedError): - filters = [[('string', '==', '1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters, - use_legacy_dataset=True) - else: - for filters in [[[('string', '==', b'1\0a')]], - [[('string', '==', '1\0a')]]]: - dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, - use_legacy_dataset=False) - assert dataset.read().num_rows == 0 + for filters in [[[('string', '==', b'1\0a')]], + [[('string', '==', '1\0a')]]]: + dataset = pq.ParquetDataset( + base_path, filesystem=fs, filters=filters) + assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): +def test_filters_cutoff_exclusive_integer(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -308,7 +186,6 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): ('integers', '<', 4), ('integers', '>', 1), ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -319,15 +196,14 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): assert result_list == [2, 3] -@pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.xfail( # different error with use_legacy_datasets because result_df is no longer # categorical raises=(TypeError, AssertionError), reason='Loss of type information in creation of categoricals.' ) -def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): +@pytest.mark.pandas +def test_filters_cutoff_exclusive_datetime(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -356,7 +232,6 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): ('dates', '<', "2018-04-12"), ('dates', '>', "2018-04-10") ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -371,7 +246,6 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): @pytest.mark.pandas -@pytest.mark.dataset def test_filters_inclusive_datetime(tempdir): # ARROW-11480 path = tempdir / 'timestamps.parquet' @@ -389,8 +263,7 @@ def test_filters_inclusive_datetime(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_inclusive_integer(tempdir, use_legacy_dataset): +def test_filters_inclusive_integer(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -413,7 +286,6 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): ('integers', '<=', 3), ('integers', '>=', 2), ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -425,8 +297,7 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_inclusive_set(tempdir, use_legacy_dataset): +def test_filters_inclusive_set(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -451,7 +322,6 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): dataset = pq.ParquetDataset( base_path, filesystem=fs, filters=[('string', 'in', 'ab')], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -464,7 +334,6 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): base_path, filesystem=fs, filters=[('integer', 'in', [1]), 
('string', 'in', ('a', 'b')), ('boolean', 'not in', {'False'})], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -475,8 +344,7 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): +def test_filters_invalid_pred_op(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -496,49 +364,30 @@ def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): with pytest.raises(TypeError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('integers', 'in', 3), ], - use_legacy_dataset=use_legacy_dataset) + filters=[('integers', 'in', 3), ]) with pytest.raises(ValueError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('integers', '=<', 3), ], - use_legacy_dataset=use_legacy_dataset) - - if use_legacy_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', 'in', set()), ], - use_legacy_dataset=use_legacy_dataset) - else: - # Dataset API returns empty table instead - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', 'in', set()), ], - use_legacy_dataset=use_legacy_dataset) - assert dataset.read().num_rows == 0 + filters=[('integers', '=<', 3), ]) - if use_legacy_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_legacy_dataset=use_legacy_dataset) - else: - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_legacy_dataset=use_legacy_dataset) - with pytest.raises(NotImplementedError): - assert dataset.read().num_rows == 0 + # Dataset API returns empty table + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', 'in', set()), ]) + assert dataset.read().num_rows == 0 + + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', '!=', {3})]) + with pytest.raises(NotImplementedError): + assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_filters_invalid_column(tempdir, use_legacy_dataset): +def test_filters_invalid_column(tempdir): # ARROW-5572 - raise error on invalid name in filter specification - # works with new dataset / xfail with legacy implementation + # works with new dataset fs = LocalFileSystem._get_instance() base_path = tempdir @@ -556,12 +405,10 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): msg = r"No match for FieldRef.Name\(non_existent_column\)" with pytest.raises(ValueError, match=msg): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('non_existent_column', '<', 3), ], - use_legacy_dataset=use_legacy_dataset).read() + filters=[('non_existent_column', '<', 3), ]).read() @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize("filters", ([('integers', '<', 3)], [[('integers', '<', 3)]], @@ -569,7 +416,7 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): pc.field('nested', 'a') < 3, pc.field('nested', 'b').cast(pa.int64()) < 3)) @pytest.mark.parametrize("read_method", ("read_table", "read_pandas")) -def test_filters_read_table(tempdir, use_legacy_dataset, filters, read_method): +def test_filters_read_table(tempdir, filters, read_method): read = getattr(pq, read_method) # test that filters keyword is passed through in read_table fs = LocalFileSystem._get_instance() @@ -589,24 +436,15 @@ def 
test_filters_read_table(tempdir, use_legacy_dataset, filters, read_method):
     _generate_partition_directories(fs, base_path, partition_spec, df)
 
-    kwargs = dict(filesystem=fs, filters=filters,
-                  use_legacy_dataset=use_legacy_dataset)
+    kwargs = dict(filesystem=fs, filters=filters)
 
-    # Using Expression in legacy dataset not supported
-    if use_legacy_dataset and isinstance(filters, pc.Expression):
-        msg = "Expressions as filter not supported for legacy dataset"
-        with pytest.raises(TypeError, match=msg):
-            read(base_path, **kwargs)
-    else:
-        table = read(base_path, **kwargs)
-        assert table.num_rows == 3
+    table = read(base_path, **kwargs)
+    assert table.num_rows == 3
 
 
 @pytest.mark.pandas
-@parametrize_legacy_dataset_fixed
-def test_partition_keys_with_underscores(tempdir, use_legacy_dataset):
+def test_partition_keys_with_underscores(tempdir):
     # ARROW-5666 - partition field values with underscores preserve underscores
-    # xfail with legacy dataset -> they get interpreted as integers
     fs = LocalFileSystem._get_instance()
     base_path = tempdir
 
@@ -623,60 +461,47 @@ def test_partition_keys_with_underscores(tempdir, use_legacy_dataset):
 
     _generate_partition_directories(fs, base_path, partition_spec, df)
 
-    dataset = pq.ParquetDataset(
-        base_path, use_legacy_dataset=use_legacy_dataset)
+    dataset = pq.ParquetDataset(base_path)
     result = dataset.read()
     assert result.column("year_week").to_pylist() == string_keys
 
 
 @pytest.mark.s3
-@parametrize_legacy_dataset
-def test_read_s3fs(s3_example_s3fs, use_legacy_dataset):
+def test_read_s3fs(s3_example_s3fs):
     fs, path = s3_example_s3fs
     path = path + "/test.parquet"
 
     table = pa.table({"a": [1, 2, 3]})
     _write_table(table, path, filesystem=fs)
 
-    result = _read_table(
-        path, filesystem=fs, use_legacy_dataset=use_legacy_dataset
-    )
+    result = _read_table(path, filesystem=fs)
     assert result.equals(table)
 
 
 @pytest.mark.s3
-@parametrize_legacy_dataset
-def test_read_directory_s3fs(s3_example_s3fs, use_legacy_dataset):
+def test_read_directory_s3fs(s3_example_s3fs):
     fs, directory = s3_example_s3fs
     path = directory + "/test.parquet"
 
     table = pa.table({"a": [1, 2, 3]})
     _write_table(table, path, filesystem=fs)
 
-    result = _read_table(
-        directory, filesystem=fs, use_legacy_dataset=use_legacy_dataset
-    )
+    result = _read_table(directory, filesystem=fs)
    assert result.equals(table)
 
 
 @pytest.mark.pandas
-@parametrize_legacy_dataset
-def test_read_single_file_list(tempdir, use_legacy_dataset):
+def test_read_single_file_list(tempdir):
     data_path = str(tempdir / 'data.parquet')
 
     table = pa.table({"a": [1, 2, 3]})
     _write_table(table, data_path)
 
-    result = pq.ParquetDataset(
-        [data_path], use_legacy_dataset=use_legacy_dataset
-    ).read()
+    result = pq.ParquetDataset([data_path]).read()
     assert result.equals(table)
 
 
 @pytest.mark.pandas
 @pytest.mark.s3
-@parametrize_legacy_dataset
-def test_read_partitioned_directory_s3fs_wrapper(
-    s3_example_s3fs, use_legacy_dataset
-):
+def test_read_partitioned_directory_s3fs_wrapper(s3_example_s3fs):
     import s3fs
 
     from pyarrow.filesystem import S3FSWrapper
@@ -690,23 +515,18 @@ def test_read_partitioned_directory_s3fs_wrapper(
     _partition_test_for_filesystem(wrapper, path)
 
     # Check that we can auto-wrap
-    dataset = pq.ParquetDataset(
-        path, filesystem=fs, use_legacy_dataset=use_legacy_dataset
-    )
+    dataset = pq.ParquetDataset(path, filesystem=fs)
     dataset.read()
 
 
 @pytest.mark.pandas
 @pytest.mark.s3
-@parametrize_legacy_dataset
-def 
test_read_partitioned_directory_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs - _partition_test_for_filesystem( - fs, path, use_legacy_dataset=use_legacy_dataset - ) + _partition_test_for_filesystem(fs, path) -def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): +def _partition_test_for_filesystem(fs, base_path): foo_keys = [0, 1] bar_keys = ['a', 'b', 'c'] partition_spec = [ @@ -724,8 +544,7 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): _generate_partition_directories(fs, base_path, partition_spec, df) - dataset = pq.ParquetDataset( - base_path, filesystem=fs, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path, filesystem=fs) table = dataset.read() result_df = (table.to_pandas() .sort_values(by='index') @@ -735,15 +554,11 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): .reset_index(drop=True) .reindex(columns=result_df.columns)) - if use_legacy_dataset or Version(pd.__version__) < Version("2.0.0"): - expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys) - expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys) - else: - # With pandas 2.0.0 Index can store all numeric dtypes (not just - # int64/uint64/float64). Using astype() to create a categorical - # column preserves original dtype (int32) - expected_df['foo'] = expected_df['foo'].astype("category") - expected_df['bar'] = expected_df['bar'].astype("category") + # With pandas 2.0.0 Index can store all numeric dtypes (not just + # int64/uint64/float64). Using astype() to create a categorical + # column preserves original dtype (int32) + expected_df['foo'] = expected_df['foo'].astype("category") + expected_df['bar'] = expected_df['bar'].astype("category") assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all() @@ -790,83 +605,6 @@ def _visit_level(base_dir, level, part_keys): _visit_level(base_dir, 0, []) -def _test_read_common_metadata_files(fs, base_path): - import pandas as pd - - import pyarrow.parquet as pq - - N = 100 - df = pd.DataFrame({ - 'index': np.arange(N), - 'values': np.random.randn(N) - }, columns=['index', 'values']) - - base_path = str(base_path) - data_path = os.path.join(base_path, 'data.parquet') - - table = pa.Table.from_pandas(df) - - with fs.open(data_path, 'wb') as f: - _write_table(table, f) - - metadata_path = os.path.join(base_path, '_common_metadata') - with fs.open(metadata_path, 'wb') as f: - pq.write_metadata(table.schema, f) - - dataset = pq.ParquetDataset(base_path, filesystem=fs, - use_legacy_dataset=True) - with pytest.warns(FutureWarning): - assert dataset.common_metadata_path == str(metadata_path) - - with fs.open(data_path) as f: - common_schema = pq.read_metadata(f).schema - assert dataset.schema.equals(common_schema) - - # handle list of one directory - dataset2 = pq.ParquetDataset([base_path], filesystem=fs, - use_legacy_dataset=True) - assert dataset2.schema.equals(dataset.schema) - - -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") -def test_read_common_metadata_files(tempdir): - fs = LocalFileSystem._get_instance() - _test_read_common_metadata_files(fs, tempdir) - - -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") -def test_read_metadata_files(tempdir): - fs = LocalFileSystem._get_instance() - - N = 100 - df = pd.DataFrame({ - 'index': np.arange(N), - 'values': np.random.randn(N) - }, columns=['index', 'values']) - - data_path = tempdir / 
'data.parquet' - - table = pa.Table.from_pandas(df) - - with fs.open(data_path, 'wb') as f: - _write_table(table, f) - - metadata_path = tempdir / '_metadata' - with fs.open(metadata_path, 'wb') as f: - pq.write_metadata(table.schema, f) - - dataset = pq.ParquetDataset(tempdir, filesystem=fs, - use_legacy_dataset=True) - with pytest.warns(FutureWarning): - assert dataset.metadata_path == str(metadata_path) - - with fs.open(data_path) as f: - metadata_schema = pq.read_metadata(f).schema - assert dataset.schema.equals(metadata_schema) - - def _filter_partition(df, part_keys): predicate = np.ones(len(df), dtype=bool) @@ -883,9 +621,8 @@ def _filter_partition(df, part_keys): return df[predicate].drop(to_drop, axis=1) -@parametrize_legacy_dataset @pytest.mark.pandas -def test_filter_before_validate_schema(tempdir, use_legacy_dataset): +def test_filter_before_validate_schema(tempdir): # ARROW-4076 apply filter before schema validation # to avoid checking unneeded schemas @@ -902,16 +639,12 @@ def test_filter_before_validate_schema(tempdir, use_legacy_dataset): pq.write_table(table2, dir2 / 'data.parquet') # read single file using filter - table = pq.read_table(tempdir, filters=[[('A', '==', 0)]], - use_legacy_dataset=use_legacy_dataset) + table = pq.read_table(tempdir, filters=[[('A', '==', 0)]]) assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) @pytest.mark.pandas -@pytest.mark.filterwarnings( - "ignore:Specifying the 'metadata':FutureWarning") -@parametrize_legacy_dataset -def test_read_multiple_files(tempdir, use_legacy_dataset): +def test_read_multiple_files(tempdir): nfiles = 10 size = 5 @@ -938,8 +671,7 @@ def test_read_multiple_files(tempdir, use_legacy_dataset): (dirpath / '_SUCCESS.crc').touch() def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): - dataset = pq.ParquetDataset( - paths, use_legacy_dataset=use_legacy_dataset, **kwargs) + dataset = pq.ParquetDataset(paths, **kwargs) return dataset.read(columns=columns, use_threads=use_threads) result = read_multiple_files(paths) @@ -947,37 +679,18 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): assert result.equals(expected) - # Read with provided metadata - # TODO(dataset) specifying metadata not yet supported - metadata = pq.read_metadata(paths[0]) - if use_legacy_dataset: - result2 = read_multiple_files(paths, metadata=metadata) - assert result2.equals(expected) - - with pytest.warns(FutureWarning, match="Specifying the 'schema'"): - result3 = pq.ParquetDataset(dirpath, schema=metadata.schema, - use_legacy_dataset=True).read() - assert result3.equals(expected) - else: - with pytest.raises(ValueError, match="no longer supported"): - pq.read_table(paths, metadata=metadata, use_legacy_dataset=False) - # Read column subset to_read = [0, 2, 6, result.num_columns - 1] col_names = [result.field(i).name for i in to_read] - out = pq.read_table( - dirpath, columns=col_names, use_legacy_dataset=use_legacy_dataset - ) + out = pq.read_table(dirpath, columns=col_names) expected = pa.Table.from_arrays([result.column(i) for i in to_read], names=col_names, metadata=result.schema.metadata) assert out.equals(expected) # Read with multiple threads - pq.read_table( - dirpath, use_threads=True, use_legacy_dataset=use_legacy_dataset - ) + pq.read_table(dirpath, use_threads=True) # Test failure modes with non-uniform metadata bad_apple = _test_dataframe(size, seed=i).iloc[:, :4] @@ -986,31 +699,24 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): t = 
pa.Table.from_pandas(bad_apple) _write_table(t, bad_apple_path) - if not use_legacy_dataset: - # TODO(dataset) Dataset API skips bad files - return + # TODO(dataset) Dataset API skips bad files - bad_meta = pq.read_metadata(bad_apple_path) + # bad_meta = pq.read_metadata(bad_apple_path) - with pytest.raises(ValueError): - read_multiple_files(paths + [bad_apple_path]) + # with pytest.raises(ValueError): + # read_multiple_files(paths + [bad_apple_path]) - with pytest.raises(ValueError): - read_multiple_files(paths, metadata=bad_meta) + # with pytest.raises(ValueError): + # read_multiple_files(paths, metadata=bad_meta) - mixed_paths = [bad_apple_path, paths[0]] + # mixed_paths = [bad_apple_path, paths[0]] - with pytest.raises(ValueError): - with pytest.warns(FutureWarning, match="Specifying the 'schema'"): - read_multiple_files(mixed_paths, schema=bad_meta.schema) - - with pytest.raises(ValueError): - read_multiple_files(mixed_paths) + # with pytest.raises(ValueError): + # read_multiple_files(mixed_paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_read_pandas(tempdir, use_legacy_dataset): +def test_dataset_read_pandas(tempdir): nfiles = 5 size = 5 @@ -1033,7 +739,7 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): frames.append(df) paths.append(path) - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) @@ -1047,10 +753,8 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): tm.assert_frame_equal(result.reindex(columns=expected.columns), expected) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_memory_map(tempdir, use_legacy_dataset): +def test_dataset_memory_map(tempdir): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() dirpath.mkdir() @@ -1061,15 +765,12 @@ def test_dataset_memory_map(tempdir, use_legacy_dataset): _write_table(table, path, version='2.6') dataset = pq.ParquetDataset( - dirpath, memory_map=True, use_legacy_dataset=use_legacy_dataset) + dirpath, memory_map=True) assert dataset.read().equals(table) - if use_legacy_dataset: - assert dataset.pieces[0].read().equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): +def test_dataset_enable_buffered_stream(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1080,19 +781,16 @@ def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): with pytest.raises(ValueError): pq.ParquetDataset( - dirpath, buffer_size=-64, - use_legacy_dataset=use_legacy_dataset) + dirpath, buffer_size=-64) for buffer_size in [128, 1024]: dataset = pq.ParquetDataset( - dirpath, buffer_size=buffer_size, - use_legacy_dataset=use_legacy_dataset) + dirpath, buffer_size=buffer_size) assert dataset.read().equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_enable_pre_buffer(tempdir, use_legacy_dataset): +def test_dataset_enable_pre_buffer(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1103,11 +801,9 @@ def test_dataset_enable_pre_buffer(tempdir, use_legacy_dataset): for pre_buffer in (True, False): dataset = pq.ParquetDataset( - dirpath, pre_buffer=pre_buffer, - use_legacy_dataset=use_legacy_dataset) + dirpath, pre_buffer=pre_buffer) assert 
dataset.read().equals(table) - actual = pq.read_table(dirpath, pre_buffer=pre_buffer, - use_legacy_dataset=use_legacy_dataset) + actual = pq.read_table(dirpath, pre_buffer=pre_buffer) assert actual.equals(table) @@ -1123,18 +819,14 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5): return paths -def _assert_dataset_paths(dataset, paths, use_legacy_dataset): - if use_legacy_dataset: - assert set(map(str, paths)) == {x.path for x in dataset._pieces} - else: - paths = [str(path.as_posix()) for path in paths] - assert set(paths) == set(dataset._dataset.files) +def _assert_dataset_paths(dataset, paths): + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset.files) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): +def test_ignore_private_directories(tempdir, dir_prefix): dirpath = tempdir / guid() dirpath.mkdir() @@ -1144,14 +836,13 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): # private directory (dirpath / '{}staging'.format(dir_prefix)).mkdir() - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): +def test_ignore_hidden_files_dot(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1164,14 +855,13 @@ def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): with (dirpath / '.private').open('wb') as f: f.write(b'gibberish') - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): +def test_ignore_hidden_files_underscore(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1184,17 +874,14 @@ def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): with (dirpath / '_started_321').open('wb') as f: f.write(b'abcd') - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_no_private_directories_in_base_path( - tempdir, dir_prefix, use_legacy_dataset -): +def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix): # ARROW-8427 - don't ignore explicitly listed files if parent directory # is a private directory dirpath = tempdir / "{0}data".format(dir_prefix) / guid() @@ -1203,17 +890,15 @@ def test_ignore_no_private_directories_in_base_path( paths = _make_example_multifile_dataset(dirpath, nfiles=10, file_nrows=5) - dataset = pq.ParquetDataset(paths, use_legacy_dataset=use_legacy_dataset) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + dataset = pq.ParquetDataset(paths) + _assert_dataset_paths(dataset, paths) # ARROW-9644 - don't ignore full directory with underscore in base path - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) - _assert_dataset_paths(dataset, paths, 
use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) + _assert_dataset_paths(dataset, paths) -@pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): +def test_ignore_custom_prefixes(tempdir): # ARROW-9573 - allow override of default ignore_prefixes part = ["xxx"] * 3 + ["yyy"] * 3 table = pa.table([ @@ -1221,7 +906,6 @@ def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): pa.array(part).dictionary_encode(), ], names=['index', '_part']) - # TODO use_legacy_dataset ARROW-10247 pq.write_to_dataset(table, str(tempdir), partition_cols=['_part']) private_duplicate = tempdir / '_private_duplicate' @@ -1230,29 +914,23 @@ def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): partition_cols=['_part']) read = pq.read_table( - tempdir, use_legacy_dataset=use_legacy_dataset, - ignore_prefixes=['_private']) + tempdir, ignore_prefixes=['_private']) assert read.equals(table) -@parametrize_legacy_dataset_fixed -def test_empty_directory(tempdir, use_legacy_dataset): - # ARROW-5310 - reading empty directory - # fails with legacy implementation +def test_empty_directory(tempdir): + # ARROW-5310 empty_dir = tempdir / 'dataset' empty_dir.mkdir() - dataset = pq.ParquetDataset( - empty_dir, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(empty_dir) result = dataset.read() assert result.num_rows == 0 assert result.num_columns == 0 -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") def _test_write_to_dataset_with_partitions(base_path, - use_legacy_dataset=True, filesystem=None, schema=None, index_name=None): @@ -1275,8 +953,7 @@ def _test_write_to_dataset_with_partitions(base_path, output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False, preserve_index=False) pq.write_to_dataset(output_table, base_path, partition_by, - filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset) + filesystem=filesystem) metadata_path = os.path.join(str(base_path), '_common_metadata') @@ -1286,19 +963,11 @@ def _test_write_to_dataset_with_partitions(base_path, else: pq.write_metadata(output_table.schema, metadata_path) - # ARROW-2891: Ensure the output_schema is preserved when writing a - # partitioned dataset dataset = pq.ParquetDataset(base_path, - filesystem=filesystem, - validate_schema=True, - use_legacy_dataset=use_legacy_dataset) + filesystem=filesystem) # ARROW-2209: Ensure the dataset schema also includes the partition columns - if use_legacy_dataset: - with pytest.warns(FutureWarning, match="'ParquetDataset.schema'"): - dataset_cols = set(dataset.schema.to_arrow_schema().names) - else: - # NB schema property is an arrow and not parquet schema - dataset_cols = set(dataset.schema.names) + # NB schema property is an arrow and not parquet schema + dataset_cols = set(dataset.schema.names) assert dataset_cols == set(output_table.schema.names) @@ -1323,7 +992,6 @@ def _test_write_to_dataset_with_partitions(base_path, def _test_write_to_dataset_no_partitions(base_path, - use_legacy_dataset=True, filesystem=None): import pandas as pd @@ -1347,7 +1015,6 @@ def _test_write_to_dataset_no_partitions(base_path, n = 5 for i in range(n): pq.write_to_dataset(output_table, base_path, - use_legacy_dataset=use_legacy_dataset, filesystem=filesystem) output_files = [file for file in filesystem.ls(str(base_path)) if file.endswith(".parquet")] @@ -1356,8 +1023,7 @@ def _test_write_to_dataset_no_partitions(base_path, # Deduplicated incoming DataFrame should match # original outgoing Dataframe 
input_table = pq.ParquetDataset( - base_path, filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset + base_path, filesystem=filesystem ).read() input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() @@ -1366,131 +1032,71 @@ def _test_write_to_dataset_no_partitions(base_path, @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions(tempdir, use_legacy_dataset): - _test_write_to_dataset_with_partitions(str(tempdir), use_legacy_dataset) +def test_write_to_dataset_with_partitions(tempdir): + _test_write_to_dataset_with_partitions(str(tempdir)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_and_schema( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_and_schema(tempdir): schema = pa.schema([pa.field('group1', type=pa.string()), pa.field('group2', type=pa.string()), pa.field('num', type=pa.int64()), pa.field('nan', type=pa.int32()), pa.field('date', type=pa.timestamp(unit='us'))]) _test_write_to_dataset_with_partitions( - str(tempdir), use_legacy_dataset, schema=schema) + str(tempdir), schema=schema) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_and_index_name( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_and_index_name(tempdir): _test_write_to_dataset_with_partitions( - str(tempdir), use_legacy_dataset, index_name='index_name') + str(tempdir), index_name='index_name') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_no_partitions(tempdir, use_legacy_dataset): - _test_write_to_dataset_no_partitions(str(tempdir), use_legacy_dataset) +def test_write_to_dataset_no_partitions(tempdir): + _test_write_to_dataset_no_partitions(str(tempdir)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pathlib(tempdir, use_legacy_dataset): - _test_write_to_dataset_with_partitions( - tempdir / "test1", use_legacy_dataset) - _test_write_to_dataset_no_partitions( - tempdir / "test2", use_legacy_dataset) +def test_write_to_dataset_pathlib(tempdir): + _test_write_to_dataset_with_partitions(tempdir / "test1") + _test_write_to_dataset_no_partitions(tempdir / "test2") @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_pathlib_nonlocal( - tempdir, s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_pathlib_nonlocal(tempdir, s3_example_s3fs): # pathlib paths are only accepted for local files fs, _ = s3_example_s3fs with pytest.raises(TypeError, match="path-like objects are only allowed"): _test_write_to_dataset_with_partitions( - tempdir / "test1", use_legacy_dataset, filesystem=fs) + tempdir / "test1", filesystem=fs) with pytest.raises(TypeError, match="path-like objects are only allowed"): _test_write_to_dataset_no_partitions( - tempdir / "test2", use_legacy_dataset, filesystem=fs) + tempdir / "test2", filesystem=fs) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_s3fs( - s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs _test_write_to_dataset_with_partitions( - path, use_legacy_dataset, filesystem=fs) + path, filesystem=fs) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_no_partitions_s3fs( - s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_no_partitions_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs 
_test_write_to_dataset_no_partitions( - path, use_legacy_dataset, filesystem=fs) + path, filesystem=fs) -@pytest.mark.filterwarnings( - "ignore:'ParquetDataset:FutureWarning", - "ignore:'partition_filename_cb':FutureWarning") -@pytest.mark.pandas -@parametrize_legacy_dataset_not_supported -def test_write_to_dataset_with_partitions_and_custom_filenames( - tempdir, use_legacy_dataset -): - output_df = pd.DataFrame({'group1': list('aaabbbbccc'), - 'group2': list('eefeffgeee'), - 'num': list(range(10)), - 'nan': [np.nan] * 10, - 'date': np.arange('2017-01-01', '2017-01-11', - dtype='datetime64[D]')}) - partition_by = ['group1', 'group2'] - output_table = pa.Table.from_pandas(output_df) - path = str(tempdir) - - def partition_filename_callback(keys): - return "{}-{}.parquet".format(*keys) - - pq.write_to_dataset(output_table, path, - partition_by, partition_filename_callback, - use_legacy_dataset=use_legacy_dataset) - - dataset = pq.ParquetDataset(path, use_legacy_dataset=use_legacy_dataset) - - # ARROW-3538: Ensure partition filenames match the given pattern - # defined in the local function partition_filename_callback - expected_basenames = [ - 'a-e.parquet', 'a-f.parquet', - 'b-e.parquet', 'b-f.parquet', - 'b-g.parquet', 'c-e.parquet' - ] - output_basenames = [os.path.basename(p.path) for p in dataset.pieces] - - assert sorted(expected_basenames) == sorted(output_basenames) - - -@pytest.mark.dataset @pytest.mark.pandas def test_write_to_dataset_filesystem(tempdir): df = pd.DataFrame({'A': [1, 2, 3]}) @@ -1502,7 +1108,7 @@ def test_write_to_dataset_filesystem(tempdir): assert result.equals(table) -def _make_dataset_for_pickling(tempdir, use_legacy_dataset=False, N=100): +def _make_dataset_for_pickling(tempdir, N=100): path = tempdir / 'data.parquet' fs = LocalFileSystem._get_instance() @@ -1525,42 +1131,22 @@ def _make_dataset_for_pickling(tempdir, use_legacy_dataset=False, N=100): pq.write_metadata(table.schema, f) dataset = pq.ParquetDataset( - tempdir, filesystem=fs, use_legacy_dataset=use_legacy_dataset) - if use_legacy_dataset: - with pytest.warns(FutureWarning): - assert dataset.metadata_path == str(metadata_path) + tempdir, filesystem=fs) return dataset @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pickle_dataset(tempdir, datadir, use_legacy_dataset, pickle_module): +def test_pickle_dataset(tempdir, pickle_module): def is_pickleable(obj): return obj == pickle_module.loads(pickle_module.dumps(obj)) - dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset) + dataset = _make_dataset_for_pickling(tempdir) assert is_pickleable(dataset) - if use_legacy_dataset: - with pytest.warns(FutureWarning): - metadata = dataset.metadata - assert is_pickleable(metadata) - assert is_pickleable(metadata.schema) - assert len(metadata.schema) - for column in metadata.schema: - assert is_pickleable(column) - - for piece in dataset._pieces: - assert is_pickleable(piece) - metadata = piece.get_metadata() - assert metadata.num_row_groups - for i in range(metadata.num_row_groups): - assert is_pickleable(metadata.row_group(i)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_partitioned_dataset(tempdir, use_legacy_dataset): +def test_partitioned_dataset(tempdir): # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset # to a Parquet file path = tempdir / "ARROW-3208" @@ -1571,27 +1157,20 @@ def test_partitioned_dataset(tempdir, use_legacy_dataset): }) table = pa.Table.from_pandas(df) pq.write_to_dataset(table, root_path=str(path), - partition_cols=['one', 
'two'], - use_legacy_dataset=use_legacy_dataset) - table = pq.ParquetDataset( - path, use_legacy_dataset=use_legacy_dataset).read() + partition_cols=['one', 'two']) + table = pq.ParquetDataset(path).read() pq.write_table(table, path / "output.parquet") -@pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_read_dictionary(tempdir, use_legacy_dataset): +def test_dataset_read_dictionary(tempdir): path = tempdir / "ARROW-3325-dataset" t1 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) t2 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) - pq.write_to_dataset(t1, root_path=str(path), - use_legacy_dataset=use_legacy_dataset) - pq.write_to_dataset(t2, root_path=str(path), - use_legacy_dataset=use_legacy_dataset) + pq.write_to_dataset(t1, root_path=str(path)) + pq.write_to_dataset(t2, root_path=str(path)) result = pq.ParquetDataset( - path, read_dictionary=['f0'], - use_legacy_dataset=use_legacy_dataset).read() + path, read_dictionary=['f0']).read() # The order of the chunks is non-deterministic ex_chunks = [t1[0].chunk(0).dictionary_encode(), @@ -1606,9 +1185,6 @@ def test_dataset_read_dictionary(tempdir, use_legacy_dataset): assert c1.equals(ex_chunks[0]) -@pytest.mark.dataset -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:Passing 'use_legacy:FutureWarning") def test_read_table_schema(tempdir): # test that schema keyword is passed through in read_table table = pa.table({'a': pa.array([1, 2, 3], pa.int32())}) @@ -1627,42 +1203,24 @@ def test_read_table_schema(tempdir): expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.equals(expected) - # don't allow it with the legacy reader - with pytest.raises( - ValueError, match="The 'schema' argument is only supported" - ): - pq.read_table(tempdir / "data.parquet", schema=schema, - use_legacy_dataset=True) - - # using ParquetDataset directory with non-legacy implementation - result = pq.ParquetDataset( - tempdir, schema=schema, use_legacy_dataset=False - ) + result = pq.ParquetDataset(tempdir, schema=schema) expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.read().equals(expected) -@pytest.mark.dataset -def test_dataset_unsupported_keywords(): - - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, metadata=pa.schema([])) +def test_read_table_duplicate_column_selection(tempdir): + # test that duplicate column selection gives duplicate columns + table = pa.table({'a': pa.array([1, 2, 3], pa.int32()), + 'b': pa.array([1, 2, 3], pa.uint8())}) + pq.write_table(table, tempdir / "data.parquet") - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, validate_schema=False) + result = pq.read_table(tempdir / "data.parquet", columns=['a', 'a']) + expected_schema = pa.schema([('a', 'int32'), ('a', 'int32')]) - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, split_row_groups=True) + assert result.column_names == ['a', 'a'] + assert result.schema == expected_schema - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, metadata_nthreads=4) - with pytest.raises(ValueError, match="no longer supported"): - pq.read_table("", use_legacy_dataset=False, metadata=pa.schema([])) - - -@pytest.mark.dataset -@pytest.mark.filterwarnings("ignore:Passing 'use_legacy:FutureWarning") def 
test_dataset_partitioning(tempdir): import pyarrow.dataset as ds @@ -1679,42 +1237,25 @@ def test_dataset_partitioning(tempdir): # read_table part = ds.partitioning(field_names=["year", "month", "day"]) result = pq.read_table( - str(root_path), partitioning=part, use_legacy_dataset=False) + str(root_path), partitioning=part) assert result.column_names == ["a", "year", "month", "day"] result = pq.ParquetDataset( - str(root_path), partitioning=part, use_legacy_dataset=False).read() + str(root_path), partitioning=part).read() assert result.column_names == ["a", "year", "month", "day"] - # This raises an error for legacy dataset - with pytest.raises(ValueError): - pq.read_table( - str(root_path), partitioning=part, use_legacy_dataset=True) - - with pytest.raises(ValueError): - pq.ParquetDataset( - str(root_path), partitioning=part, use_legacy_dataset=True) - -@pytest.mark.dataset def test_parquet_dataset_new_filesystem(tempdir): # Ensure we can pass new FileSystem object to ParquetDataset - # (use new implementation automatically without specifying - # use_legacy_dataset=False) table = pa.table({'a': [1, 2, 3]}) pq.write_table(table, tempdir / 'data.parquet') - # don't use simple LocalFileSystem (as that gets mapped to legacy one) filesystem = fs.SubTreeFileSystem(str(tempdir), fs.LocalFileSystem()) dataset = pq.ParquetDataset('.', filesystem=filesystem) result = dataset.read() assert result.equals(table) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") -@parametrize_legacy_dataset -def test_parquet_dataset_partitions_piece_path_with_fsspec( - tempdir, use_legacy_dataset -): +def test_parquet_dataset_partitions_piece_path_with_fsspec(tempdir): # ARROW-10462 ensure that on Windows we properly use posix-style paths # as used by fsspec fsspec = pytest.importorskip("fsspec") @@ -1725,109 +1266,12 @@ def test_parquet_dataset_partitions_piece_path_with_fsspec( # pass a posix-style path (using "/" also on Windows) path = str(tempdir).replace("\\", "/") dataset = pq.ParquetDataset( - path, filesystem=filesystem, use_legacy_dataset=use_legacy_dataset) + path, filesystem=filesystem) # ensure the piece path is also posix-style expected = path + "/data.parquet" - assert dataset.pieces[0].path == expected - - -@pytest.mark.dataset -def test_parquet_dataset_deprecated_properties(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - pq.write_table(table, path) - dataset = pq.ParquetDataset(path, use_legacy_dataset=True) - - with pytest.warns(FutureWarning, match="'ParquetDataset.pieces"): - dataset.pieces - - with pytest.warns(FutureWarning, match="'ParquetDataset.partitions"): - dataset.partitions - - with pytest.warns(FutureWarning, match="'ParquetDataset.memory_map"): - dataset.memory_map - - with pytest.warns(FutureWarning, match="'ParquetDataset.read_dictio"): - dataset.read_dictionary - - with pytest.warns(FutureWarning, match="'ParquetDataset.buffer_size"): - dataset.buffer_size - - with pytest.warns(FutureWarning, match="'ParquetDataset.fs"): - dataset.fs - - with pytest.warns(FutureWarning, match="'ParquetDataset.schema'"): - dataset.schema - - with pytest.warns(FutureWarning, match="'ParquetDataset.common_metadata'"): - dataset.common_metadata - - with pytest.warns(FutureWarning, match="'ParquetDataset.metadata"): - dataset.metadata + assert dataset.fragments[0].path == expected - with pytest.warns(FutureWarning, match="'ParquetDataset.metadata_path"): - dataset.metadata_path - with pytest.warns(FutureWarning, - 
match="'ParquetDataset.common_metadata_path"): - dataset.common_metadata_path - - dataset2 = pq.ParquetDataset(path, use_legacy_dataset=False) - - with pytest.warns(FutureWarning, match="'ParquetDataset.pieces"): - dataset2.pieces - - -@pytest.mark.dataset -def test_parquet_write_to_dataset_deprecated_properties(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.warns(FutureWarning, - match="Passing 'use_legacy_dataset=True'"): - pq.write_to_dataset(table, path, use_legacy_dataset=True) - - # check also that legacy implementation is set when - # partition_filename_cb is specified - with pytest.warns(FutureWarning, - match="Passing 'use_legacy_dataset=True'"): - pq.write_to_dataset(table, path, - partition_filename_cb=lambda x: 'filename.parquet') - - -@pytest.mark.dataset -def test_parquet_write_to_dataset_unsupported_keywords_in_legacy(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.raises(ValueError, match="schema"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - schema=pa.schema([ - ('a', pa.int32()) - ])) - - with pytest.raises(ValueError, match="partitioning"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - partitioning=["a"]) - - with pytest.raises(ValueError, match="use_threads"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - use_threads=False) - - with pytest.raises(ValueError, match="file_visitor"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - file_visitor=lambda x: x) - - with pytest.raises(ValueError, match="existing_data_behavior"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - existing_data_behavior='error') - - with pytest.raises(ValueError, match="basename_template"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - basename_template='part-{i}.parquet') - - -@pytest.mark.dataset def test_parquet_write_to_dataset_exposed_keywords(tempdir): table = pa.table({'a': [1, 2, 3]}) path = tempdir / 'partitioning' @@ -1841,8 +1285,7 @@ def file_visitor(written_file): pq.write_to_dataset(table, path, partitioning=["a"], file_visitor=file_visitor, - basename_template=basename_template, - use_legacy_dataset=False) + basename_template=basename_template) expected_paths = { path / '1' / 'part-0.parquet', @@ -1853,53 +1296,6 @@ def file_visitor(written_file): assert paths_written_set == expected_paths -@pytest.mark.dataset -def test_write_to_dataset_conflicting_keywords(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.raises(ValueError, match="'basename_template' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - partition_filename_cb=lambda x: 'filename.parquet', - basename_template='file-{i}.parquet') - with pytest.raises(ValueError, match="'partition_filename_cb' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - partition_filename_cb=lambda x: 'filename.parquet', - basename_template='file-{i}.parquet') - - with pytest.raises(ValueError, match="'partitioning' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - partition_cols=["a"], - partitioning=["a"]) - - with pytest.raises(ValueError, match="'partition_cols' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - 
partition_cols=["a"], - partitioning=["a"]) - - with pytest.raises(ValueError, match="'file_visitor' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - metadata_collector=[], - file_visitor=lambda x: x) - with pytest.raises(ValueError, match="'metadata_collector' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - metadata_collector=[], - file_visitor=lambda x: x) - - -@pytest.mark.dataset @pytest.mark.parametrize("write_dataset_kwarg", ( ("create_dir", True), ("create_dir", False), @@ -1926,8 +1322,7 @@ def test_write_to_dataset_kwargs_passed(tempdir, write_dataset_kwarg): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset): +def test_write_to_dataset_category_observed(tempdir): # if we partition on a categorical variable with "unobserved" categories # (values present in the dictionary, but not in the actual data) # ensure those are not creating empty files/directories @@ -1938,8 +1333,7 @@ def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset): table = pa.table(df) path = tempdir / "dataset" pq.write_to_dataset( - table, tempdir / "dataset", partition_cols=["cat"], - use_legacy_dataset=use_legacy_dataset + table, tempdir / "dataset", partition_cols=["cat"] ) subdirs = [f.name for f in path.iterdir() if f.is_dir()] assert len(subdirs) == 2 diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index f97c451df7ad7..6a9cbd4f73d4f 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -23,8 +23,7 @@ import pytest import pyarrow as pa -from pyarrow.tests.parquet.common import ( - _check_roundtrip, parametrize_legacy_dataset) +from pyarrow.tests.parquet.common import _check_roundtrip try: import pyarrow.parquet as pq @@ -48,8 +47,7 @@ @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_datetime_tz(use_legacy_dataset): +def test_pandas_parquet_datetime_tz(): # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units # so we need to cast the pandas dtype. Pandas v1 will always silently # coerce to [ns] due to lack of non-[ns] support. 
@@ -69,21 +67,19 @@ def test_pandas_parquet_datetime_tz(use_legacy_dataset): _write_table(arrow_table, f) f.seek(0) - table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(f) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_datetime_timezone_tzinfo(use_legacy_dataset): +def test_datetime_timezone_tzinfo(): value = datetime.datetime(2018, 1, 1, 1, 23, 45, tzinfo=datetime.timezone.utc) df = pd.DataFrame({'foo': [value]}) - _roundtrip_pandas_dataframe( - df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset) + _roundtrip_pandas_dataframe(df, write_kwargs={}) @pytest.mark.pandas diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 0ed305bff1945..f194d12876968 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -23,8 +23,6 @@ import pyarrow as pa from pyarrow.fs import LocalFileSystem, SubTreeFileSystem -from pyarrow.tests.parquet.common import ( - parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported) from pyarrow.util import guid from pyarrow.vendored.version import Version @@ -101,8 +99,7 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): +def test_pandas_parquet_column_multiindex(tempdir): df = alltypes_sample(size=10) df.columns = pd.MultiIndex.from_tuples( list(zip(df.columns, df.columns[::-1])), @@ -115,17 +112,13 @@ def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): _write_table(arrow_table, filename) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( - tempdir, use_legacy_dataset -): +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) filename = tempdir / 'pandas_roundtrip.parquet' @@ -137,8 +130,7 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( assert js['columns'] _write_table(arrow_table, filename) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) js = table_read.schema.pandas_metadata assert not js['index_columns'] @@ -150,52 +142,20 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( tm.assert_frame_equal(df, df_read) -# TODO(dataset) duplicate column selection actually gives duplicate columns now -@pytest.mark.pandas -@parametrize_legacy_dataset_not_supported -def test_pandas_column_selection(tempdir, use_legacy_dataset): - size = 10000 - np.random.seed(0) - df = pd.DataFrame({ - 'uint8': np.arange(size, dtype=np.uint8), - 'uint16': np.arange(size, dtype=np.uint16) - }) - filename = tempdir / 'pandas_roundtrip.parquet' - arrow_table = pa.Table.from_pandas(df) - _write_table(arrow_table, filename) - table_read = _read_table( - filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset) - df_read = table_read.to_pandas() - - tm.assert_frame_equal(df[['uint8']], df_read) - - # ARROW-4267: Selection of duplicate columns still leads to these columns - # being read uniquely. 
- table_read = _read_table( - filename, columns=['uint8', 'uint8'], - use_legacy_dataset=use_legacy_dataset) - df_read = table_read.to_pandas() - - tm.assert_frame_equal(df[['uint8']], df_read) - - @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_native_file_roundtrip(): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version='2.6') buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table( - reader, use_legacy_dataset=use_legacy_dataset).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_pandas_column_subset(tempdir, use_legacy_dataset): +def test_read_pandas_column_subset(): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() @@ -204,27 +164,24 @@ def test_read_pandas_column_subset(tempdir, use_legacy_dataset): reader = pa.BufferReader(buf) df_read = pq.read_pandas( reader, columns=['strings', 'uint8'], - use_legacy_dataset=use_legacy_dataset ).to_pandas() tm.assert_frame_equal(df[['strings', 'uint8']], df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_empty_roundtrip(): df = _test_dataframe(0) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version='2.6') buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table( - reader, use_legacy_dataset=use_legacy_dataset).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -def test_pandas_can_write_nested_data(tempdir): +def test_pandas_can_write_nested_data(): data = { "agg_col": [ {"page_type": 1}, @@ -241,8 +198,7 @@ def test_pandas_can_write_nested_data(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_pyfile_roundtrip(tempdir): filename = tempdir / 'pandas_pyfile_roundtrip.parquet' size = 5 df = pd.DataFrame({ @@ -260,14 +216,13 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): data = io.BytesIO(filename.read_bytes()) - table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(data) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): +def test_pandas_parquet_configuration_options(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -289,16 +244,14 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): for use_dictionary in [True, False]: _write_table(arrow_table, filename, version='2.6', use_dictionary=use_dictionary) - table_read = _read_table( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) for write_statistics in [True, False]: _write_table(arrow_table, filename, version='2.6', write_statistics=write_statistics) - table_read = _read_table(filename, - use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -308,8 +261,7 @@ def 
test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): continue _write_table(arrow_table, filename, version='2.6', compression=compression) - table_read = _read_table( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -327,8 +279,7 @@ def test_spark_flavor_preserves_pandas_metadata(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_index_column_name_duplicate(tempdir, use_legacy_dataset): +def test_index_column_name_duplicate(tempdir): data = { 'close': { pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, @@ -352,14 +303,13 @@ def test_index_column_name_duplicate(tempdir, use_legacy_dataset): tdfx = pa.Table.from_pandas(dfx) _write_table(tdfx, path) - arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset) + arrow_table = _read_table(path) result_df = arrow_table.to_pandas() tm.assert_frame_equal(result_df, dfx) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): +def test_multiindex_duplicate_values(tempdir): num_rows = 3 numbers = list(range(num_rows)) index = pd.MultiIndex.from_arrays( @@ -373,7 +323,7 @@ def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): filename = tempdir / 'dup_multi_index_levels.parquet' _write_table(table, filename) - result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + result_table = _read_table(filename) assert table.equals(result_table) result_df = result_table.to_pandas() @@ -381,8 +331,7 @@ def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): +def test_backwards_compatible_index_naming(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -397,17 +346,13 @@ def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0, engine='python') - table = _read_table( - datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_multi_level_named( - datadir, use_legacy_dataset -): +def test_backwards_compatible_index_multi_level_named(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -426,17 +371,13 @@ def test_backwards_compatible_index_multi_level_named( header=0, engine='python' ).sort_index() - table = _read_table(datadir / 'v0.7.1.all-named-index.parquet', - use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.all-named-index.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_multi_level_some_named( - datadir, use_legacy_dataset -): +def test_backwards_compatible_index_multi_level_some_named(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -456,17 +397,13 @@ def test_backwards_compatible_index_multi_level_some_named( ).sort_index() 
expected.index = expected.index.set_names(['cut', None, 'clarity']) - table = _read_table(datadir / 'v0.7.1.some-named-index.parquet', - use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.some-named-index.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_column_metadata_handling( - datadir, use_legacy_dataset -): +def test_backwards_compatible_column_metadata_handling(datadir): expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -476,19 +413,18 @@ def test_backwards_compatible_column_metadata_handling( names=['index', None]) path = datadir / 'v0.7.1.column-metadata-handling.parquet' - table = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table = _read_table(path) result = table.to_pandas() tm.assert_frame_equal(result, expected) table = _read_table( - path, columns=['a'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a']) result = table.to_pandas() tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_categorical_index_survives_roundtrip(use_legacy_dataset): +def test_categorical_index_survives_roundtrip(): # ARROW-3652, addressed by ARROW-3246 df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2']) df['c1'] = df['c1'].astype('category') @@ -497,15 +433,13 @@ def test_categorical_index_survives_roundtrip(use_legacy_dataset): table = pa.Table.from_pandas(df) bos = pa.BufferOutputStream() pq.write_table(table, bos) - ref_df = pq.read_pandas( - bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() + ref_df = pq.read_pandas(bos.getvalue()).to_pandas() assert isinstance(ref_df.index, pd.CategoricalIndex) assert ref_df.index.equals(df.index) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_categorical_order_survives_roundtrip(use_legacy_dataset): +def test_categorical_order_survives_roundtrip(): # ARROW-6302 df = pd.DataFrame({"a": pd.Categorical( ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)}) @@ -515,15 +449,13 @@ def test_categorical_order_survives_roundtrip(use_legacy_dataset): pq.write_table(table, bos) contents = bos.getvalue() - result = pq.read_pandas( - contents, use_legacy_dataset=use_legacy_dataset).to_pandas() + result = pq.read_pandas(contents).to_pandas() tm.assert_frame_equal(result, df) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): +def test_pandas_categorical_na_type_row_groups(): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) df_category = df.astype({"col": "category", "int": "category"}) @@ -533,8 +465,7 @@ def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): # it works pq.write_table(table_cat, buf, version='2.6', chunk_size=10) - result = pq.read_table( - buf.getvalue(), use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(buf.getvalue()) # Result is non-categorical assert result[0].equals(table[0]) @@ -542,8 +473,7 @@ def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_categorical_roundtrip(use_legacy_dataset): +def test_pandas_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 # Have one of the categories unobserved and include a null (-1) @@ -555,8 +485,7 @@ def 
test_pandas_categorical_roundtrip(use_legacy_dataset): buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) - result = pq.read_table( - buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() + result = pq.read_table(buf.getvalue()).to_pandas() assert result.x.dtype == 'category' assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) @@ -587,41 +516,28 @@ def test_categories_with_string_pyarrow_dtype(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pandas_preserve_extensiondtypes( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]}) df['col'] = df['col'].astype("Int64") table = pa.table(df) pq.write_to_dataset( table, str(tempdir / "case1"), partition_cols=['part'], - use_legacy_dataset=use_legacy_dataset ) - result = pq.read_table( - str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "case1")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) - pq.write_to_dataset( - table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ) - result = pq.read_table( - str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + pq.write_to_dataset(table, str(tempdir / "case2")) + result = pq.read_table(str(tempdir / "case2")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) pq.write_table(table, str(tempdir / "data.parquet")) - result = pq.read_table( - str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset): +def test_write_to_dataset_pandas_preserve_index(tempdir): # ARROW-8251 - preserve pandas index in roundtrip df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]}) @@ -632,34 +548,24 @@ def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset): pq.write_to_dataset( table, str(tempdir / "case1"), partition_cols=['part'], - use_legacy_dataset=use_legacy_dataset ) - result = pq.read_table( - str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "case1")).to_pandas() tm.assert_frame_equal(result, df_cat) - pq.write_to_dataset( - table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ) - result = pq.read_table( - str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + pq.write_to_dataset(table, str(tempdir / "case2")) + result = pq.read_table(str(tempdir / "case2")).to_pandas() tm.assert_frame_equal(result, df) pq.write_table(table, str(tempdir / "data.parquet")) - result = pq.read_table( - str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() tm.assert_frame_equal(result, df) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('preserve_index', [True, False, None]) @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"]) def test_dataset_read_pandas_common_metadata( - tempdir, use_legacy_dataset, preserve_index, metadata_fname + tempdir, preserve_index, metadata_fname ): # ARROW-1103 nfiles = 5 @@ -696,7 +602,7 @@ def 
test_dataset_read_pandas_common_metadata( ) pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname) - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 9f920206a107e..93097a1afaac9 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -18,7 +18,6 @@ import io import os import sys -from unittest import mock import pytest @@ -296,28 +295,6 @@ def test_parquet_file_explicitly_closed(tempdir): table = pa.table({'col1': [0, 1], 'col2': [0, 1]}) pq.write_table(table, fn) - # read_table (legacy) with opened file (will leave open) - with open(fn, 'rb') as f: - pq.read_table(f, use_legacy_dataset=True) - assert not f.closed # Didn't close it internally after read_table - - # read_table (legacy) with unopened file (will close) - with mock.patch.object(pq.ParquetFile, "close") as mock_close: - pq.read_table(fn, use_legacy_dataset=True) - mock_close.assert_called() - - # ParquetDataset test (legacy) with unopened file (will close) - with mock.patch.object(pq.ParquetFile, "close") as mock_close: - pq.ParquetDataset(fn, use_legacy_dataset=True).read() - mock_close.assert_called() - - # ParquetDataset test (legacy) with opened file (will leave open) - with open(fn, 'rb') as f: - # ARROW-8075: support ParquetDataset from file-like, not just path-like - with pytest.raises(TypeError, match='not a path-like object'): - pq.ParquetDataset(f, use_legacy_dataset=True).read() - assert not f.closed - # ParquetFile with opened file (will leave open) with open(fn, 'rb') as f: with pq.ParquetFile(f) as p: @@ -338,7 +315,7 @@ def test_parquet_file_explicitly_closed(tempdir): @pytest.mark.s3 @pytest.mark.parametrize("use_uri", (True, False)) -def test_parquet_file_with_filesystem(tempdir, s3_example_fs, use_uri): +def test_parquet_file_with_filesystem(s3_example_fs, use_uri): s3_fs, s3_uri, s3_path = s3_example_fs args = (s3_uri if use_uri else s3_path,) diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index b902541015aa2..16584684f5c7f 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -20,7 +20,6 @@ import pyarrow as pa from pyarrow import fs from pyarrow.filesystem import FileSystem, LocalFileSystem -from pyarrow.tests.parquet.common import parametrize_legacy_dataset try: import pyarrow.parquet as pq @@ -44,8 +43,7 @@ @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): +def test_parquet_incremental_file_build(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -65,8 +63,7 @@ def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): writer.close() buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @@ -105,8 +102,7 @@ def test_parquet_invalid_writer(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): +def 
test_parquet_writer_context_obj(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -124,18 +120,14 @@ def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): frames.append(df.copy()) buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_context_obj_with_exception( - tempdir, use_legacy_dataset -): +def test_parquet_writer_context_obj_with_exception(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -160,8 +152,7 @@ def test_parquet_writer_context_obj_with_exception( assert str(e) == error_text buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @@ -340,8 +331,7 @@ def test_parquet_writer_filesystem_buffer_raises(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset): +def test_parquet_writer_with_caller_provided_filesystem(): out = pa.BufferOutputStream() class CustomFS(FileSystem): @@ -368,8 +358,7 @@ def open(self, path, mode='rb'): assert out.closed buf = out.getvalue() - table_read = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(pa.BufferReader(buf)) df_read = table_read.to_pandas() tm.assert_frame_equal(df_read, df) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a37eb1e426f7a..e2bb4400c8bde 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1148,7 +1148,6 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") - # write_to_dataset currently requires pandas pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( @@ -1158,10 +1157,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): return table, dataset -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments(tempdir, dataset_reader): table, dataset = _create_dataset_for_fragments(tempdir) @@ -1208,10 +1204,7 @@ def test_fragments_implicit_cast(tempdir): assert len(list(fragments)) == 1 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_reconstruct(tempdir, dataset_reader, pickle_module): table, dataset = _create_dataset_for_fragments(tempdir) @@ -1272,10 +1265,7 @@ def assert_yields_projected(fragment, row_slice, dataset_reader.to_table(new_fragment, filter=ds.field('part') == 'a') -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups(tempdir, dataset_reader): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1326,8 +1316,6 @@ def test_fragments_parquet_num_row_groups(tempdir): @pytest.mark.pandas @pytest.mark.parquet def test_fragments_parquet_row_groups_dictionary(tempdir, dataset_reader): - import pandas as pd - df = pd.DataFrame(dict(col1=['a', 
'b'], col2=[1, 2])) df['col1'] = df['col1'].astype("category") @@ -1340,10 +1328,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir, dataset_reader): assert (df.iloc[0] == result.to_pandas()).all().all() -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs, pickle_module): fs, assert_opens = open_logging_fs _, dataset = _create_dataset_for_fragments( @@ -1384,7 +1369,6 @@ def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs, pickle_modu assert row_group.statistics is not None -@pytest.mark.pandas @pytest.mark.parquet def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs, pickle_module): # https://issues.apache.org/jira/browse/ARROW-15796 @@ -1454,16 +1438,13 @@ def _create_dataset_all_types(tempdir, chunk_size=None): path = str(tempdir / "test_parquet_dataset_all_types") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, use_legacy_dataset=True, - chunk_size=chunk_size) + pq.write_to_dataset(table, path, chunk_size=chunk_size) return table, ds.dataset(path, format="parquet", partitioning="hive") @pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_parquet_fragment_statistics(tempdir): table, dataset = _create_dataset_all_types(tempdir) @@ -1529,10 +1510,7 @@ def test_parquet_empty_row_group_statistics(tempdir): assert fragments[0].row_groups[0].statistics == {} -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1555,10 +1533,7 @@ def test_fragments_parquet_row_groups_predicate(tempdir): assert len(row_group_fragments) == 0 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader, pickle_module): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1600,10 +1575,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader, dataset_reader.to_table(new_fragment) -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_ids(tempdir, open_logging_fs, dataset_reader): fs, assert_opens = open_logging_fs @@ -1631,10 +1603,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs, assert result.equals(table[:0]) -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_filter(tempdir, open_logging_fs, dataset_reader): fs, assert_opens = open_logging_fs @@ -1666,10 +1635,7 @@ def test_fragments_parquet_subset_filter(tempdir, open_logging_fs, assert subfrag.num_row_groups == 4 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_invalid(tempdir): _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1) fragment = list(dataset.get_fragments())[0] @@ -3591,10 +3557,7 @@ def test_parquet_dataset_factory_fsspec(tempdir): @pytest.mark.parquet @pytest.mark.pandas # 
write_to_dataset currently requires pandas -@pytest.mark.parametrize('use_legacy_dataset', [False, True]) -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") -def test_parquet_dataset_factory_roundtrip(tempdir, use_legacy_dataset): +def test_parquet_dataset_factory_roundtrip(tempdir): # Simple test to ensure we can roundtrip dataset to # _metadata/common_metadata and back. A more complex test # using partitioning will have to wait for ARROW-13269. The @@ -3606,7 +3569,6 @@ def test_parquet_dataset_factory_roundtrip(tempdir, use_legacy_dataset): metadata_collector = [] pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector, - use_legacy_dataset=use_legacy_dataset ) metadata_path = str(root_path / '_metadata') # write _metadata file @@ -3820,7 +3782,6 @@ def test_dataset_project_only_partition_columns(tempdir, dataset_reader): @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir, dataset_reader): - import pandas as pd df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) f = tempdir / "test_dataset_project_null_column.parquet" @@ -3930,8 +3891,7 @@ def test_write_to_dataset_given_null_just_works(tempdir): 'col': list(range(4))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=[ - 'part'], use_legacy_dataset=False) + pq.write_to_dataset(table, path, partition_cols=['part']) actual_table = pq.read_table(tempdir / 'test_dataset') # column.equals can handle the difference in chunking but not the fact @@ -3941,28 +3901,6 @@ def test_write_to_dataset_given_null_just_works(tempdir): assert actual_table.column('col').equals(table.column('col')) -@pytest.mark.parquet -@pytest.mark.pandas -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") -def test_legacy_write_to_dataset_drops_null(tempdir): - schema = pa.schema([ - pa.field('col', pa.int64()), - pa.field('part', pa.dictionary(pa.int32(), pa.string())) - ]) - table = pa.table({'part': ['a', 'a', None, None], - 'col': list(range(4))}, schema=schema) - expected = pa.table( - {'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) - - path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=[ - 'part'], use_legacy_dataset=True) - - actual = pq.read_table(tempdir / 'test_dataset') - assert actual == expected - - def _sort_table(tab, sort_col): import pyarrow.compute as pc sorted_indices = pc.sort_indices( diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index 511dbf9a1c4e1..5b94c200f35de 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -27,7 +27,7 @@ from pyarrow.tests import util from pyarrow.tests.parquet.common import _test_dataframe from pyarrow.tests.parquet.test_dataset import ( - _test_read_common_metadata_files, _test_write_to_dataset_with_partitions, + _test_write_to_dataset_with_partitions, _test_write_to_dataset_no_partitions ) from pyarrow.util import guid @@ -309,6 +309,9 @@ def _write_multiple_hdfs_pq_files(self, tmpdir): expected = pa.concat_tables(test_data) return expected + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.pandas @pytest.mark.parquet def test_read_multiple_parquet_files(self): @@ -343,6 +346,9 @@ def test_read_multiple_parquet_files_with_uri(self): expected.to_pandas() ) + 
@pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.pandas @pytest.mark.parquet def test_read_write_parquet_files_with_uri(self): @@ -360,19 +366,13 @@ def test_read_write_parquet_files_with_uri(self): pq.write_table(table, path, filesystem=self.hdfs) - result = pq.read_table( - path, filesystem=self.hdfs, use_legacy_dataset=True - ).to_pandas() + result = pq.read_table(path, filesystem=self.hdfs).to_pandas() assert_frame_equal(result, df) - @pytest.mark.parquet - @pytest.mark.pandas - def test_read_common_metadata_files(self): - tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid()) - self.hdfs.mkdir(tmpdir) - _test_read_common_metadata_files(self.hdfs, tmpdir) - + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_with_partitions(self): @@ -381,6 +381,9 @@ def test_write_to_dataset_with_partitions(self): _test_write_to_dataset_with_partitions( tmpdir, filesystem=self.hdfs) + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_no_partitions(self): From 2abb3fb7095241300e2bb2aadd953b0f23970237 Mon Sep 17 00:00:00 2001 From: "Rossi(Ruoxi) Sun" Date: Thu, 21 Dec 2023 14:14:45 -0800 Subject: [PATCH 090/570] GH-32570: [C++] Fix the issue of `ExecBatchBuilder` when appending consecutive tail rows with the same id may exceed buffer boundary (#39234) ### Rationale for this change Addressed in https://github.com/apache/arrow/issues/32570#issuecomment-1856473812 ### What changes are included in this PR? 1. Skip consecutive rows with the same id when calculating rows to skip when appending to `ExecBatchBuilder`. 2. Fix the bug that column offset is neglected when calculating rows to skip. ### Are these changes tested? Yes. New UT included and the change is also protected by the existing case mentioned in the issue. ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** Because #32570 is labeled critical, and causes a crash even when the API contract is upheld. 
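To make the duplicate-row case concrete, here is a rough, self-contained sketch of the fixed skip logic (it mirrors the loop body changed in `light_array.cc` but is not Arrow code). The offsets correspond to the data used by the new unit test: three 16-byte values, a 7-byte value and a 9-byte value in a 64-byte buffer. The outer loop condition and the 8-byte tail threshold are assumptions made only for this illustration.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Offsets of the five binary values; the 9-byte tail row ends exactly at
  // the 64-byte buffer capacity.
  const int32_t offsets[] = {0, 16, 32, 48, 55, 64};
  const uint16_t row_ids[] = {4, 4};       // the same tail row selected twice
  const int num_tail_bytes_to_skip = 8;    // e.g. one machine word (assumed)

  int num_rows_left = 2;
  int num_bytes_skipped = 0;
  while (num_rows_left > 0 && num_bytes_skipped < num_tail_bytes_to_skip) {
    --num_rows_left;
    const int row_id_removed = row_ids[num_rows_left];
    num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed];
    // The fix: consecutive duplicates occupy the same source bytes, so they
    // are stepped over together.
    while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
      --num_rows_left;
    }
  }
  // Prints 0: neither copy of row 4 may use the unchecked wide-copy path.
  std::cout << num_rows_left << std::endl;
  return 0;
}
```

Without the inner loop, the sketch reports 1, meaning one of the duplicated rows would still be handled by the unchecked wide-copy path over a value that ends exactly at the buffer boundary, which is the overrun described above.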
* Closes: #32570 Authored-by: zanmato Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/light_array.cc | 7 ++++-- cpp/src/arrow/compute/light_array.h | 4 +++- cpp/src/arrow/compute/light_array_test.cc | 26 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 4e8b2b2d7cc3a..93a054de1957c 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -398,9 +398,12 @@ int ExecBatchBuilder::NumRowsToSkip(const std::shared_ptr& column, } else { --num_rows_left; int row_id_removed = row_ids[num_rows_left]; - const uint32_t* offsets = - reinterpret_cast(column->buffers[1]->data()); + const int32_t* offsets = column->GetValues(1); num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed]; + // Skip consecutive rows with the same id + while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) { + --num_rows_left; + } } } diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h index 87f6b6c76a12c..84aa86d64bb62 100644 --- a/cpp/src/arrow/compute/light_array.h +++ b/cpp/src/arrow/compute/light_array.h @@ -416,7 +416,9 @@ class ARROW_EXPORT ExecBatchBuilder { // without checking buffer bounds (useful with SIMD or fixed size memory loads // and stores). // - // The sequence of row_ids provided must be non-decreasing. + // The sequence of row_ids provided must be non-decreasing. In case of consecutive rows + // with the same row id, they are skipped all at once because they occupy the same + // space. // static int NumRowsToSkip(const std::shared_ptr& column, int num_rows, const uint16_t* row_ids, int num_tail_bytes_to_skip); diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 4e33f7b578ea8..52121530fe91d 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -471,6 +471,32 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) { ASSERT_EQ(0, pool->bytes_allocated()); } +TEST(ExecBatchBuilder, AppendBatchDupRows) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + // Case of cross-word copying for the last row, which may exceed the buffer boundary. + // This is a simplified case of GH-32570 + { + // 64-byte data fully occupying one minimal 64-byte aligned memory region. + ExecBatch batch_string = JSONToExecBatch({binary()}, R"([["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["ABCDEF0"], + ["123456789"]])"); // 9-byte tail row, larger than a word. 
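+    // buffers[1] holds the int32 offsets and buffers[2] the value bytes of the
+    // binary column; the values above add up to 3 * 16 + 7 + 9 = 64 bytes, so
+    // the 9-byte tail row ends exactly at the 64-byte capacity asserted below.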
+ ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64); + ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64); + ExecBatchBuilder builder; + uint16_t row_ids[2] = {4, 4}; + ASSERT_OK(builder.AppendSelected(pool, batch_string, 2, row_ids, /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ExecBatch batch_string_appended = + JSONToExecBatch({binary()}, R"([["123456789"], ["123456789"]])"); + ASSERT_EQ(batch_string_appended, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + TEST(ExecBatchBuilder, AppendBatchesSomeCols) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); From 929c40bcbded7184a5f6894db208f16975de4d37 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 22 Dec 2023 00:37:29 +0000 Subject: [PATCH 091/570] GH-39343: [C++][FS][Azure] Add client secret auth configuration (#39346) ### Rationale for this change Client is a useful Azure authentication ### What changes are included in this PR? Implement `AzureOptions::ConfigureClientSecretCredential` ### Are these changes tested? Simple unittest ### Are there any user-facing changes? Client secret auth is now supported on the Azure filesystem. * Closes: #39343 Authored-by: Thomas Newton Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 10 ++++++++++ cpp/src/arrow/filesystem/azurefs.h | 5 +++++ cpp/src/arrow/filesystem/azurefs_test.cc | 7 +++++++ 3 files changed, 22 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 27bdb5092a3ea..26c2761886050 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -113,6 +113,16 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_na return Status::OK(); } +Status AzureOptions::ConfigureClientSecretCredential(const std::string& account_name, + const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = std::make_shared( + tenant_id, client_id, client_secret); + return Status::OK(); +} + Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 69f6295237043..346dd349e935c 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -110,6 +110,11 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureAccountKeyCredential(const std::string& account_name, const std::string& account_key); + Status ConfigureClientSecretCredential(const std::string& account_name, + const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret); + bool Equals(const AzureOptions& other) const; std::string AccountBlobUrl(const std::string& account_name) const; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 3266c1bfda2dc..62c5ef2232045 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -271,6 +271,13 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +TEST(AzureFileSystem, InitializeFilesystemWithClientSecretCredential) { + AzureOptions options; + ARROW_EXPECT_OK(options.ConfigureClientSecretCredential( + 
"dummy-account-name", "tenant_id", "client_id", "client_secret")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); From 51970e066e69ab01f9bdcc81219781ae07b9799b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 22 Dec 2023 02:06:50 +0100 Subject: [PATCH 092/570] GH-39006: [Python] Extract libparquet requirements out of libarrow_python.so to new libarrow_python_parquet_encryption.so (#39316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change If I build pyarrow with everything and then I remove some of the Arrow CPP .so in order to have a minimal build I can't import pyarrow because it requires libarrow and libparquet. This is relevant in order to have a minimal build for Conda. Please see the related issue for more information. ### What changes are included in this PR? Move libarrow parquet encryption for pyarrow to its own shared object. ### Are these changes tested? I will run extensive CI with extra python archery tests. ### Are there any user-facing changes? No, and yes :) There will be a new .so on pyarrow but shouldn't be relevant in my opinion. * Closes: #39006 Lead-authored-by: Raúl Cumplido Co-authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- ci/scripts/python_test.sh | 2 + ci/scripts/python_wheel_unix_test.sh | 1 + ci/scripts/python_wheel_windows_test.bat | 1 + python/CMakeLists.txt | 38 ++++++++++--------- .../src/arrow/python/parquet_encryption.h | 33 +++++++++++++--- 5 files changed, 53 insertions(+), 22 deletions(-) diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 8d818346faa6e..341c2dd0577ef 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -45,6 +45,7 @@ export ARROW_DEBUG_MEMORY_POOL=trap : ${PYARROW_TEST_HDFS:=${ARROW_HDFS:-ON}} : ${PYARROW_TEST_ORC:=${ARROW_ORC:-ON}} : ${PYARROW_TEST_PARQUET:=${ARROW_PARQUET:-ON}} +: ${PYARROW_TEST_PARQUET_ENCRYPTION:=${PARQUET_REQUIRE_ENCRYPTION:-ON}} : ${PYARROW_TEST_S3:=${ARROW_S3:-ON}} export PYARROW_TEST_ACERO @@ -56,6 +57,7 @@ export PYARROW_TEST_GCS export PYARROW_TEST_HDFS export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET +export PYARROW_TEST_PARQUET_ENCRYPTION export PYARROW_TEST_S3 # Testing PyArrow diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index a6cc3bb7b29b7..01250ff7ef40c 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -46,6 +46,7 @@ export PYARROW_TEST_HDFS=ON export PYARROW_TEST_ORC=ON export PYARROW_TEST_PANDAS=ON export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PARQUET_ENCRYPTION=ON export PYARROW_TEST_SUBSTRAIT=${ARROW_SUBSTRAIT} export PYARROW_TEST_S3=${ARROW_S3} export PYARROW_TEST_TENSORFLOW=ON diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index c73b0cfd1b9bd..b14bfddfb36d3 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -26,6 +26,7 @@ set PYARROW_TEST_GCS=ON set PYARROW_TEST_HDFS=ON set PYARROW_TEST_ORC=OFF set PYARROW_TEST_PARQUET=ON +set PYARROW_TEST_PARQUET_ENCRYPTION=ON set PYARROW_TEST_SUBSTRAIT=ON set PYARROW_TEST_S3=OFF set PYARROW_TEST_TENSORFLOW=ON diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3f810d27271e5..2df1e67b9f4c7 100644 --- 
a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -332,22 +332,6 @@ if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) find_package(Parquet REQUIRED) endif() -if(PYARROW_BUILD_PARQUET_ENCRYPTION) - if(PARQUET_REQUIRE_ENCRYPTION) - list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/parquet_encryption.cc) - if(ARROW_BUILD_SHARED) - list(APPEND PYARROW_CPP_LINK_LIBS Parquet::parquet_shared) - else() - list(APPEND PYARROW_CPP_LINK_LIBS Parquet::parquet_static) - endif() - message(STATUS "Parquet Encryption Enabled") - else() - message(FATAL_ERROR "You must build Arrow C++ with PARQUET_REQUIRE_ENCRYPTION=ON") - endif() -else() - message(STATUS "Parquet Encryption is NOT Enabled") -endif() - if(PYARROW_BUILD_HDFS) if(NOT ARROW_HDFS) message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") @@ -391,6 +375,26 @@ install(TARGETS arrow_python LIBRARY DESTINATION . RUNTIME DESTINATION .) +set(PYARROW_CPP_ENCRYPTION_SRCS ${PYARROW_CPP_SOURCE_DIR}/parquet_encryption.cc) +if(NOT PYARROW_BUILD_PARQUET_ENCRYPTION) + message(STATUS "Parquet Encryption is NOT Enabled") +else() + if(PARQUET_REQUIRE_ENCRYPTION) + add_library(arrow_python_parquet_encryption SHARED ${PYARROW_CPP_ENCRYPTION_SRCS}) + target_link_libraries(arrow_python_parquet_encryption PUBLIC arrow_python + ${PARQUET_LINK_LIBS}) + target_compile_definitions(arrow_python_parquet_encryption + PRIVATE ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING) + install(TARGETS arrow_python_parquet_encryption + ARCHIVE DESTINATION . + LIBRARY DESTINATION . + RUNTIME DESTINATION .) + message(STATUS "Parquet Encryption Enabled") + else() + message(FATAL_ERROR "You must build Arrow C++ with PARQUET_REQUIRE_ENCRYPTION=ON") + endif() +endif() + set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) if(PYARROW_BUILD_FLIGHT) if(NOT ARROW_FLIGHT) @@ -814,6 +818,6 @@ endif() if(PYARROW_BUILD_PARQUET) target_link_libraries(_parquet PRIVATE ${PARQUET_LINK_LIBS}) if(PYARROW_BUILD_PARQUET_ENCRYPTION) - target_link_libraries(_parquet_encryption PRIVATE ${PARQUET_LINK_LIBS}) + target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption) endif() endif() diff --git a/python/pyarrow/src/arrow/python/parquet_encryption.h b/python/pyarrow/src/arrow/python/parquet_encryption.h index 23ee478348ecd..a1aaa30e260f5 100644 --- a/python/pyarrow/src/arrow/python/parquet_encryption.h +++ b/python/pyarrow/src/arrow/python/parquet_encryption.h @@ -26,6 +26,27 @@ #include "parquet/encryption/kms_client.h" #include "parquet/encryption/kms_client_factory.h" +#if defined(_WIN32) || defined(__CYGWIN__) // Windows +#if defined(_MSC_VER) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_PYTHON_STATIC +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT +#elif defined(ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING) +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllexport) +#else +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllimport) +#endif + +#else // Not Windows +#ifndef ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __attribute__((visibility("default"))) +#endif +#endif // Non-Windows + namespace arrow { namespace py { namespace parquet { @@ -33,7 +54,7 @@ namespace encryption { /// \brief A table of function pointers for calling from C++ into /// Python. 
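// The classes below are exported with ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
// (defined above) so that they remain visible to users of the new
// libarrow_python_parquet_encryption shared library instead of libarrow_python.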
-class ARROW_PYTHON_EXPORT PyKmsClientVtable { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientVtable { public: std::function @@ -44,7 +65,8 @@ class ARROW_PYTHON_EXPORT PyKmsClientVtable { }; /// \brief A helper for KmsClient implementation in Python. -class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClient + : public ::parquet::encryption::KmsClient { public: PyKmsClient(PyObject* handler, PyKmsClientVtable vtable); ~PyKmsClient() override; @@ -62,7 +84,7 @@ class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient /// \brief A table of function pointers for calling from C++ into /// Python. -class ARROW_PYTHON_EXPORT PyKmsClientFactoryVtable { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactoryVtable { public: std::function> SafeGetFileEncryptionProperties( From cd5a1bd259a95eb9342569fb01d41a5924aec30f Mon Sep 17 00:00:00 2001 From: Ravjot Brar <83892020+ravjotbrar@users.noreply.github.com> Date: Fri, 22 Dec 2023 07:03:32 -0800 Subject: [PATCH 093/570] GH-39014: [Java] Add default truststore along with KeychainStore when on Mac system (#39235) ### Rationale for this change As described in #39014, when using the system TrustStore on Mac, the certificates returned do not include Root CAs trusted by the system. This change adds the default KeyStore instance along with the KeyChainStore to include trusted Root CAs. The reason we add the default KeyStore instance is because there is no easy way to get the certificates from the System Roots keychain. ### What changes are included in this PR? I've updated ClientAuthenticationUtils to get the default KeyStore instance when the operating system is macOS and have updated the tests to include this change. ### Are these changes tested? See changes made in ClientAuthenticationUtilsTest.java. ### Are there any user-facing changes? No * Closes: #39014 Authored-by: Ravjot Brar Signed-off-by: David Li --- .../utils/ClientAuthenticationUtils.java | 21 ++++++---- .../utils/ClientAuthenticationUtilsTest.java | 42 +++++++++++++++++-- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java index d50dc385a62e1..ffb0048181c7c 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java @@ -115,6 +115,16 @@ static KeyStore getKeyStoreInstance(String instance) return keyStore; } + @VisibleForTesting + static KeyStore getDefaultKeyStoreInstance(String password) + throws KeyStoreException, CertificateException, NoSuchAlgorithmException, IOException { + try (InputStream fileInputStream = getKeystoreInputStream()) { + KeyStore keyStore = KeyStore.getInstance(KeyStore.getDefaultType()); + keyStore.load(fileInputStream, password == null ? 
null : password.toCharArray()); + return keyStore; + } + } + static String getOperatingSystem() { return System.getProperty("os.name"); } @@ -156,16 +166,9 @@ public static InputStream getCertificateInputStreamFromSystem(String password) t keyStoreList.add(getKeyStoreInstance("Windows-MY")); } else if (isMac()) { keyStoreList.add(getKeyStoreInstance("KeychainStore")); + keyStoreList.add(getDefaultKeyStoreInstance(password)); } else { - try (InputStream fileInputStream = getKeystoreInputStream()) { - KeyStore keyStore = KeyStore.getInstance(KeyStore.getDefaultType()); - if (password == null) { - keyStore.load(fileInputStream, null); - } else { - keyStore.load(fileInputStream, password.toCharArray()); - } - keyStoreList.add(keyStore); - } + keyStoreList.add(getDefaultKeyStoreInstance(password)); } return getCertificatesInputStream(keyStoreList); diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java index 27bba64587367..b7977462e9c01 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java @@ -77,6 +77,33 @@ public void testGetKeyStoreInstance() throws IOException, } } + @Test + public void testGetDefaultKeyStoreInstancePassword() throws IOException, + KeyStoreException, CertificateException, NoSuchAlgorithmException { + try (MockedStatic keyStoreMockedStatic = Mockito.mockStatic(KeyStore.class)) { + + keyStoreMockedStatic + .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit")) + .thenReturn(keyStoreMock); + KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit"); + Assert.assertEquals(receiveKeyStore, keyStoreMock); + } + } + + @Test + public void testGetDefaultKeyStoreInstanceNoPassword() throws IOException, + KeyStoreException, CertificateException, NoSuchAlgorithmException { + try (MockedStatic keyStoreMockedStatic = Mockito.mockStatic(KeyStore.class)) { + + keyStoreMockedStatic + .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance(null)) + .thenReturn(keyStoreMock); + KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance(null); + Assert.assertEquals(receiveKeyStore, keyStoreMock); + } + } + + @Test public void testGetCertificateInputStreamFromMacSystem() throws IOException, KeyStoreException, CertificateException, NoSuchAlgorithmException { @@ -90,11 +117,18 @@ public void testGetCertificateInputStreamFromMacSystem() throws IOException, keyStoreMockedStatic.when(() -> ClientAuthenticationUtils .getKeyStoreInstance("KeychainStore")) .thenReturn(keyStoreMock); + keyStoreMockedStatic.when(() -> ClientAuthenticationUtils + .getDefaultKeyStoreInstance("changeit")) + .thenReturn(keyStoreMock); + clientAuthenticationUtilsMockedStatic + .when(ClientAuthenticationUtils::getKeystoreInputStream) + .thenCallRealMethod(); + keyStoreMockedStatic.when(KeyStore::getDefaultType).thenCallRealMethod(); keyStoreMockedStatic.when(() -> ClientAuthenticationUtils .getCertificatesInputStream(Mockito.any())) .thenReturn(mock); - InputStream inputStream = ClientAuthenticationUtils.getCertificateInputStreamFromSystem("test"); + InputStream inputStream = 
ClientAuthenticationUtils.getCertificateInputStreamFromSystem("changeit"); Assert.assertEquals(inputStream, mock); } } @@ -136,9 +170,11 @@ public void testGetCertificateInputStreamFromLinuxSystem() throws IOException, setOperatingSystemMock(clientAuthenticationUtilsMockedStatic, false, false); keyStoreMockedStatic.when(() -> ClientAuthenticationUtils - .getCertificatesInputStream(Mockito.any())) + .getCertificatesInputStream(Mockito.any())) .thenReturn(mock); - + keyStoreMockedStatic.when(() -> ClientAuthenticationUtils + .getDefaultKeyStoreInstance(Mockito.any())) + .thenReturn(keyStoreMock); clientAuthenticationUtilsMockedStatic .when(ClientAuthenticationUtils::getKeystoreInputStream) .thenCallRealMethod(); From a4a3d3f4825eb025657121e70c9d86e8d6ecff35 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 22 Dec 2023 23:17:58 +0800 Subject: [PATCH 094/570] GH-39265: [Java] Make it run well with the netty newest version 4.1.104 (#39266) ### Describe the enhancement requested When I used `netty arrow memory 14.0.1` and `netty 4.1.104.Final` in Spark, a compilation error occurred. After PR https://github.com/netty/netty/pull/13613, `PoolArena` no longer extends `SizeClasses` but instead uses it as one of its fields. In order to ensure that `netty arrow memory 14.0.1` works well with the `netty 4.1.104.Final` version, I suggest making similar modifications here. 1. Compilation errors are as follows: https://github.com/panbingkun/spark/actions/runs/7237466030/job/19717162391 2. Some bugs have been fixed in `netty 4.1.104.Final`; see the release notes: 4.1.104.Final release note: https://netty.io/news/2023/12/15/4-1-104-Final.html 4.1.103.Final release note: https://netty.io/news/2023/12/13/4-1-103-Final.html 4.1.101.Final release note: https://netty.io/news/2023/11/09/4-1-101-Final.html ### Component(s) Java * Closes: #39265 Authored-by: panbingkun Signed-off-by: David Li --- .../main/java/io/netty/buffer/PooledByteBufAllocatorL.java | 7 ++----- java/pom.xml | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 06c6669cfd162..ba9aba353c351 100644 --- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -71,7 +71,7 @@ public UnsafeDirectLittleEndian allocate(long size) { } public int getChunkSize() { - return allocator.chunkSize; + return allocator.chunkSize(); } public long getHugeBufferSize() { @@ -137,7 +137,6 @@ private class InnerAllocator extends PooledByteBufAllocator { private final PoolArena[] directArenas; private final MemoryStatusThread statusThread; - private final int chunkSize; public InnerAllocator() { super(true); @@ -150,8 +149,6 @@ public InnerAllocator() { throw new RuntimeException("Failure while initializing allocator. Unable to retrieve direct arenas field.", e); } - this.chunkSize = directArenas[0].chunkSize; - if (memoryLogger.isTraceEnabled()) { statusThread = new MemoryStatusThread(this); statusThread.start(); @@ -166,7 +163,7 @@ private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCa if (directArena != null) { - if (initialCapacity > directArena.chunkSize) { + if (initialCapacity > chunkSize()) { // This is beyond chunk size so we'll allocate separately.
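// Netty 4.1.104 (netty/netty#13613) changed PoolArena to no longer extend
// SizeClasses, so directArena.chunkSize no longer compiles here; the
// allocator-level chunkSize() accessor is used instead.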
ByteBuf buf = UnpooledByteBufAllocator.DEFAULT.directBuffer(initialCapacity, maxCapacity); diff --git a/java/pom.xml b/java/pom.xml index 75e0946f10811..4cca5e7245f0f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.10.1 2.0.9 32.1.3-jre - 4.1.100.Final + 4.1.104.Final 1.60.0 3.23.1 2.16.0 From 87971df049c09671bae8a207fe2b29704fe21e8d Mon Sep 17 00:00:00 2001 From: John Garland Date: Sat, 23 Dec 2023 04:27:20 +1100 Subject: [PATCH 095/570] GH-39335: [C#] Support creating FlightClient with Grpc.Core.Channel (#39348) as well as Grpc.Net.Client.GrpcChannel by changing our constructor arg to Grpc.Core.ChannelBase which both classes inherit from. ### Rationale for this change ### What changes are included in this PR? Changing the constructor of C#'s Flight Client to take in a ChannelBase which allows for multiple implementations of gRPC channels to be passed in. ### Are these changes tested? Existing tests already cover the use but have also manually tested in a separate app ( ### Are there any user-facing changes? No as we're just changing the constructor to take in a parent/base class instead. * Closes: #39335 Authored-by: John Garland Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs b/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs index 5dc0d1b434b6d..a7c459935c240 100644 --- a/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs +++ b/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs @@ -16,10 +16,8 @@ using System.Threading.Tasks; using Apache.Arrow.Flight.Internal; using Apache.Arrow.Flight.Protocol; -using Apache.Arrow.Flight.Server; using Apache.Arrow.Flight.Server.Internal; using Grpc.Core; -using Grpc.Net.Client; namespace Apache.Arrow.Flight.Client { @@ -29,7 +27,7 @@ public class FlightClient private readonly FlightService.FlightServiceClient _client; - public FlightClient(GrpcChannel grpcChannel) + public FlightClient(ChannelBase grpcChannel) { _client = new FlightService.FlightServiceClient(grpcChannel); } From 7b71156d99557168d46292c010f82b812947ffb8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 22 Dec 2023 17:02:31 -0400 Subject: [PATCH 096/570] GH-39138: [R] Fix implicit conversion warnings (#39250) ### Rationale for this change We have failing CRAN checks because this warning occurs on one check machine. ### What changes are included in this PR? Implicit integer casts are made explicit and/or variable declarations were fixed so that fewer implicit integer casts were performed. Fully solving the warnings also requires https://github.com/r-lib/cpp11/pull/349 since some errors occur in those headers. ### Are these changes tested? This particular test we can't do on CI because the MacOS runner we have doesn't have a new enough `clang` to support the requisite `-W` flags. I tested this locally by adding `PKG_CXXFLAGS=-Wconversion -Wno-sign-conversion -Wsign-compare -Werror` to `Makevars.in`. ### Are there any user-facing changes? 
No * Closes: #39138 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- r/src/altrep.cpp | 56 +++++++++++++++++++---------- r/src/array.cpp | 18 ++++++---- r/src/array_to_vector.cpp | 14 ++++---- r/src/arraydata.cpp | 12 +++---- r/src/arrowExports.cpp | 76 +++++++++++++++++++-------------------- r/src/arrow_cpp11.h | 14 +++++++- r/src/arrow_types.h | 4 +-- r/src/chunkedarray.cpp | 5 ++- r/src/compression.cpp | 2 +- r/src/compute.cpp | 15 ++++---- r/src/dataset.cpp | 4 +-- r/src/datatype.cpp | 2 +- r/src/io.cpp | 11 ++++-- r/src/message.cpp | 4 +-- r/src/r_to_arrow.cpp | 18 +++++----- r/src/recordbatch.cpp | 14 ++++---- r/src/schema.cpp | 4 +-- r/src/table.cpp | 16 ++++----- 18 files changed, 165 insertions(+), 124 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 9745393d01bbc..bdaac0a9ce5d2 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -275,7 +275,8 @@ struct AltrepVectorPrimitive : public AltrepVectorBase(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; return array->IsNull(j) ? cpp11::na() @@ -466,10 +467,10 @@ struct AltrepFactor : public AltrepVectorBase { std::unique_ptr unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); - size_t n_arrays = chunked_array->num_chunks(); + int n_arrays = chunked_array->num_chunks(); BufferVector arrays_transpose(n_arrays); - for (size_t i = 0; i < n_arrays; i++) { + for (int i = 0; i < n_arrays; i++) { const auto& dict_i = *internal::checked_cast(*chunked_array->chunk(i)) .dictionary(); @@ -559,17 +560,14 @@ struct AltrepFactor : public AltrepVectorBase { return dup; } - // The value at position i - static int Elt(SEXP alt, R_xlen_t i) { - if (Base::IsMaterialized(alt)) { - return INTEGER_ELT(Representation(alt), i); - } - + // The value at position i as an int64_t (to make bounds checking less verbose) + static int64_t Elt64(SEXP alt, R_xlen_t i) { auto altrep_data = reinterpret_cast(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; if (!array->IsNull(j)) { @@ -578,7 +576,7 @@ struct AltrepFactor : public AltrepVectorBase { if (WasUnified(alt)) { const auto* transpose_data = reinterpret_cast( - GetArrayTransposed(alt, resolve.chunk_index)->data()); + GetArrayTransposed(alt, static_cast(resolve.chunk_index))->data()); switch (indices->type_id()) { case Type::UINT8: @@ -617,7 +615,7 @@ struct AltrepFactor : public AltrepVectorBase { case Type::INT64: return indices->data()->GetValues(1)[j] + 1; case Type::UINT64: - return indices->data()->GetValues(1)[j] + 1; + return static_cast(indices->data()->GetValues(1)[j] + 1); default: break; } @@ -628,6 +626,18 @@ struct AltrepFactor : public AltrepVectorBase { return NA_INTEGER; } + // The value at position i as an int (which R needs because this is a factor) + static int Elt(SEXP alt, R_xlen_t i) { + if (Base::IsMaterialized(alt)) { + return INTEGER_ELT(Representation(alt), i); + } + + int64_t elt64 = Elt64(alt, i); + ARROW_R_DCHECK(elt64 == NA_INTEGER || elt64 >= 1); + ARROW_R_DCHECK(elt64 <= std::numeric_limits::max()); + return static_cast(elt64); + } + static R_xlen_t 
Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) { // If we have data2, we can just copy the region into buf // using the standard Get_region for this R type @@ -667,7 +677,7 @@ struct AltrepFactor : public AltrepVectorBase { // using the transpose data for this chunk const auto* transpose_data = reinterpret_cast(GetArrayTransposed(alt, j)->data()); - auto transpose = [transpose_data](int x) { return transpose_data[x]; }; + auto transpose = [transpose_data](int64_t x) { return transpose_data[x]; }; GetRegionDispatch(array, indices, transpose, out); @@ -677,7 +687,7 @@ struct AltrepFactor : public AltrepVectorBase { } else { // simpler case, identity transpose - auto transpose = [](int x) { return x; }; + auto transpose = [](int64_t x) { return static_cast(x); }; int* out = buf; for (const auto& array : slice->chunks()) { @@ -718,7 +728,13 @@ struct AltrepFactor : public AltrepVectorBase { VisitArraySpanInline( *array->data(), - /*valid_func=*/[&](index_type index) { *out++ = transpose(index) + 1; }, + /*valid_func=*/ + [&](index_type index) { + int64_t transposed = transpose(index) + 1; + ARROW_R_DCHECK(transposed >= 1); + ARROW_R_DCHECK(transposed <= std::numeric_limits::max()); + *out++ = static_cast(transposed); + }, /*null_func=*/[&]() { *out++ = cpp11::na(); }); } @@ -765,7 +781,8 @@ struct AltrepVectorString : public AltrepVectorBase> { bool no_nul = std::find(view_.begin(), view_.end(), '\0') == view_.end(); if (no_nul) { - return Rf_mkCharLenCE(view_.data(), view_.size(), CE_UTF8); + ARROW_R_DCHECK(view_.size() <= std::numeric_limits::max()); + return Rf_mkCharLenCE(view_.data(), static_cast(view_.size()), CE_UTF8); } else if (strip_out_nuls_) { return ConvertStripNul(); } else { @@ -802,7 +819,9 @@ struct AltrepVectorString : public AltrepVectorBase> { } nul_was_stripped_ = true; - return Rf_mkCharLenCE(stripped_string_.data(), stripped_len, CE_UTF8); + ARROW_R_DCHECK(stripped_len <= std::numeric_limits::max()); + return Rf_mkCharLenCE(stripped_string_.data(), static_cast(stripped_len), + CE_UTF8); } bool nul_was_stripped() const { return nul_was_stripped_; } @@ -847,7 +866,8 @@ struct AltrepVectorString : public AltrepVectorBase> { auto altrep_data = reinterpret_cast(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; SEXP s = NA_STRING; diff --git a/r/src/array.cpp b/r/src/array.cpp index ae76c01a94910..38406e494d67b 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -92,7 +92,7 @@ std::shared_ptr Array__Slice2(const std::shared_ptr& return array->Slice(offset, length); } -void arrow::r::validate_index(int i, int len) { +void arrow::r::validate_index(int64_t i, int64_t len) { if (i == NA_INTEGER) { cpp11::stop("'i' cannot be NA"); } @@ -119,10 +119,14 @@ r_vec_size Array__length(const std::shared_ptr& x) { } // [[arrow::export]] -int Array__offset(const std::shared_ptr& x) { return x->offset(); } +r_vec_size Array__offset(const std::shared_ptr& x) { + return r_vec_size(x->offset()); +} // [[arrow::export]] -int Array__null_count(const std::shared_ptr& x) { return x->null_count(); } +r_vec_size Array__null_count(const std::shared_ptr& x) { + return r_vec_size(x->null_count()); +} // [[arrow::export]] std::shared_ptr Array__type(const std::shared_ptr& x) { @@ -263,9 +267,9 @@ r_vec_size LargeListArray__value_length( } // 
[[arrow::export]] -r_vec_size FixedSizeListArray__value_length( +int FixedSizeListArray__value_length( const std::shared_ptr& array, int64_t i) { - return r_vec_size(array->value_length(i)); + return array->value_length(i); } // [[arrow::export]] @@ -294,10 +298,10 @@ cpp11::writable::integers ListArray__raw_value_offsets( } // [[arrow::export]] -cpp11::writable::integers LargeListArray__raw_value_offsets( +cpp11::writable::doubles LargeListArray__raw_value_offsets( const std::shared_ptr& array) { auto offsets = array->raw_value_offsets(); - return cpp11::writable::integers(offsets, offsets + array->length()); + return cpp11::writable::doubles(offsets, offsets + array->length()); } // [[arrow::export]] diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index bf026d2723a1a..2f0508eb7a47a 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -375,7 +375,7 @@ struct Converter_String : public Converter { private: static SEXP r_string_from_view(std::string_view view) { - return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + return Rf_mkCharLenCE(view.data(), static_cast(view.size()), CE_UTF8); } static SEXP r_string_from_view_strip_nul(std::string_view view, @@ -576,10 +576,10 @@ class Converter_Dictionary : public Converter { const auto& arr_type = checked_cast(*chunked_array->type()); unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); - size_t n_arrays = chunked_array->num_chunks(); + int n_arrays = chunked_array->num_chunks(); arrays_transpose_.resize(n_arrays); - for (size_t i = 0; i < n_arrays; i++) { + for (int i = 0; i < n_arrays; i++) { const auto& dict_i = *checked_cast(*chunked_array->chunk(i)).dictionary(); StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose_[i])); @@ -748,7 +748,7 @@ class Converter_Struct : public Converter { auto colnames = arrow::r::to_r_strings( type->fields(), [](const std::shared_ptr& field) { return field->name(); }); - out.attr(symbols::row_names) = arrow::r::short_row_names(n); + out.attr(symbols::row_names) = arrow::r::short_row_names(static_cast(n)); out.attr(R_NamesSymbol) = colnames; out.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df; @@ -756,7 +756,7 @@ class Converter_Struct : public Converter { } Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { - int nf = converters.size(); + int nf = static_cast(converters.size()); for (int i = 0; i < nf; i++) { SEXP data_i = VECTOR_ELT(data, i); @@ -771,7 +771,7 @@ class Converter_Struct : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto struct_array = checked_cast(array.get()); - int nf = converters.size(); + int nf = static_cast(converters.size()); // Flatten() deals with merging of nulls auto arrays = ValueOrStop(struct_array->Flatten(gc_memory_pool())); for (int i = 0; i < nf; i++) { @@ -1384,7 +1384,7 @@ cpp11::writable::list to_data_frame(const std::shared_ptr& data, tbl.attr(R_NamesSymbol) = names; tbl.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df; - tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(nr); + tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(static_cast(nr)); return tbl; } diff --git a/r/src/arraydata.cpp b/r/src/arraydata.cpp index cdab38f1147aa..d879e807323af 100644 --- a/r/src/arraydata.cpp +++ b/r/src/arraydata.cpp @@ -26,18 +26,18 @@ std::shared_ptr ArrayData__get_type( } // [[arrow::export]] -int ArrayData__get_length(const std::shared_ptr& x) { - return x->length; +r_vec_size 
ArrayData__get_length(const std::shared_ptr& x) { + return r_vec_size(x->length); } // [[arrow::export]] -int ArrayData__get_null_count(const std::shared_ptr& x) { - return x->null_count; +r_vec_size ArrayData__get_null_count(const std::shared_ptr& x) { + return r_vec_size(x->null_count); } // [[arrow::export]] -int ArrayData__get_offset(const std::shared_ptr& x) { - return x->offset; +r_vec_size ArrayData__get_offset(const std::shared_ptr& x) { + return r_vec_size(x->offset); } // [[arrow::export]] diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 790207efce1d2..75e0f27b4002e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -110,7 +110,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int Array__offset(const std::shared_ptr& x); +r_vec_size Array__offset(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__offset(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -118,7 +118,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int Array__null_count(const std::shared_ptr& x); +r_vec_size Array__null_count(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__null_count(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -315,7 +315,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -r_vec_size FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); +int FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_FixedSizeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -359,7 +359,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -cpp11::writable::integers LargeListArray__raw_value_offsets(const std::shared_ptr& array); +cpp11::writable::doubles LargeListArray__raw_value_offsets(const std::shared_ptr& array); extern "C" SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -467,7 +467,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_length(const std::shared_ptr& x); +r_vec_size ArrayData__get_length(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_length(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -475,7 +475,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_null_count(const std::shared_ptr& x); +r_vec_size ArrayData__get_null_count(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_null_count(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -483,7 +483,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_offset(const std::shared_ptr& x); +r_vec_size ArrayData__get_offset(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_offset(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -765,7 +765,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -r_vec_size ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); +int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__num_chunks(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -869,11 +869,11 @@ BEGIN_CPP11 END_CPP11 } // compression.cpp -std::shared_ptr util___Codec__Create(arrow::Compression::type codec, R_xlen_t compression_level); +std::shared_ptr util___Codec__Create(arrow::Compression::type codec, int compression_level); extern "C" SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_level_sexp){ BEGIN_CPP11 arrow::r::Input::type codec(codec_sexp); - 
arrow::r::Input::type compression_level(compression_level_sexp); + arrow::r::Input::type compression_level(compression_level_sexp); return cpp11::as_sexp(util___Codec__Create(codec, compression_level)); END_CPP11 } @@ -2024,14 +2024,14 @@ extern "C" SEXP _arrow_dataset___JsonFragmentScanOptions__Make(SEXP parse_option // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, int64_t thrift_string_size_limit, int64_t thrift_container_size_limit); +std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, int32_t thrift_string_size_limit, int32_t thrift_container_size_limit); extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp, SEXP thrift_string_size_limit_sexp, SEXP thrift_container_size_limit_sexp){ BEGIN_CPP11 arrow::r::Input::type use_buffered_stream(use_buffered_stream_sexp); arrow::r::Input::type buffer_size(buffer_size_sexp); arrow::r::Input::type pre_buffer(pre_buffer_sexp); - arrow::r::Input::type thrift_string_size_limit(thrift_string_size_limit_sexp); - arrow::r::Input::type thrift_container_size_limit(thrift_container_size_limit_sexp); + arrow::r::Input::type thrift_string_size_limit(thrift_string_size_limit_sexp); + arrow::r::Input::type thrift_container_size_limit(thrift_container_size_limit_sexp); return cpp11::as_sexp(dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer, thrift_string_size_limit, thrift_container_size_limit)); END_CPP11 } @@ -2567,10 +2567,10 @@ BEGIN_CPP11 END_CPP11 } // datatype.cpp -std::shared_ptr FixedSizeBinary__initialize(R_xlen_t byte_width); +std::shared_ptr FixedSizeBinary__initialize(int32_t byte_width); extern "C" SEXP _arrow_FixedSizeBinary__initialize(SEXP byte_width_sexp){ BEGIN_CPP11 - arrow::r::Input::type byte_width(byte_width_sexp); + arrow::r::Input::type byte_width(byte_width_sexp); return cpp11::as_sexp(FixedSizeBinary__initialize(byte_width)); END_CPP11 } @@ -3976,7 +3976,7 @@ BEGIN_CPP11 END_CPP11 } // message.cpp -r_vec_size ipc___Message__Verify(const std::unique_ptr& message); +bool ipc___Message__Verify(const std::unique_ptr& message); extern "C" SEXP _arrow_ipc___Message__Verify(SEXP message_sexp){ BEGIN_CPP11 arrow::r::Input&>::type message(message_sexp); @@ -4684,7 +4684,7 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -r_vec_size RecordBatch__num_columns(const std::shared_ptr& x); +int RecordBatch__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_RecordBatch__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -4734,11 +4734,11 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, R_xlen_t i); +std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__column(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__column(batch, i)); END_CPP11 } @@ -4771,42 +4771,42 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__AddColumn(const std::shared_ptr& batch, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr RecordBatch__AddColumn(const std::shared_ptr& batch, int i, const std::shared_ptr& 
field, const std::shared_ptr& column); extern "C" SEXP _arrow_RecordBatch__AddColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(RecordBatch__AddColumn(batch, i, field, column)); END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__SetColumn(const std::shared_ptr& batch, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr RecordBatch__SetColumn(const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_RecordBatch__SetColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(RecordBatch__SetColumn(batch, i, field, column)); END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__RemoveColumn(const std::shared_ptr& batch, R_xlen_t i); +std::shared_ptr RecordBatch__RemoveColumn(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__RemoveColumn(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__RemoveColumn(batch, i)); END_CPP11 } // recordbatch.cpp -std::string RecordBatch__column_name(const std::shared_ptr& batch, R_xlen_t i); +std::string RecordBatch__column_name(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__column_name(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__column_name(batch, i)); END_CPP11 } @@ -5346,7 +5346,7 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -r_vec_size Table__num_columns(const std::shared_ptr& x); +int Table__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -5379,20 +5379,20 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -std::shared_ptr Table__column(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__column(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__column(table, i)); END_CPP11 } // table.cpp -std::shared_ptr Table__field(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__field(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__field(table, i)); END_CPP11 } @@ -5476,31 +5476,31 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -std::shared_ptr Table__RemoveColumn(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__RemoveColumn(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__RemoveColumn(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - 
arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__RemoveColumn(table, i)); END_CPP11 } // table.cpp -std::shared_ptr Table__AddColumn(const std::shared_ptr& table, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr Table__AddColumn(const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_Table__AddColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(Table__AddColumn(table, i, field, column)); END_CPP11 } // table.cpp -std::shared_ptr Table__SetColumn(const std::shared_ptr& table, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr Table__SetColumn(const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_Table__SetColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(Table__SetColumn(table, i, field, column)); diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index d8c4b719d1d3e..ab60586628164 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -27,6 +27,18 @@ #include "./nameof.h" +// Simple dcheck that doesn't use assert (i.e., won't crash the R session) +// Condition this on our own debug flag to avoid this ending up in any CRAN +// checks. +#if defined(ARROW_R_DEBUG) +#define ARROW_R_DCHECK(EXPR) \ + do { \ + if (!(EXPR)) Rf_error("Failed DCHECK: %s evaluated to false", #EXPR); \ + } while (false) +#else +#define ARROW_R_DCHECK(EXPR) +#endif + // borrowed from enc package // because R does not make these macros available (i.e. 
from Defn.h) #define UTF8_MASK (1 << 3) @@ -465,7 +477,7 @@ inline SEXP as_sexp(r_vec_size size) { if (x > std::numeric_limits::max()) { return Rf_ScalarReal(x); } else { - return Rf_ScalarInteger(x); + return Rf_ScalarInteger(static_cast(x)); } } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index fadc39c75fc06..05c8f6062dabb 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -189,13 +189,13 @@ void validate_slice_offset(R_xlen_t offset, int64_t len); void validate_slice_length(R_xlen_t length, int64_t available); -void validate_index(int i, int len); +void validate_index(int64_t i, int64_t len); template void TraverseDots(cpp11::list dots, int num_fields, Lambda lambda) { cpp11::strings names(dots.attr(R_NamesSymbol)); - for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { + for (int i = 0, j = 0; j < num_fields; i++) { auto name_i = names[i]; if (name_i.size() == 0) { diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index 36884bb531b62..258013fc4da57 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -34,9 +34,8 @@ r_vec_size ChunkedArray__null_count( } // [[arrow::export]] -r_vec_size ChunkedArray__num_chunks( - const std::shared_ptr& chunked_array) { - return r_vec_size(chunked_array->num_chunks()); +int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array) { + return chunked_array->num_chunks(); } // [[arrow::export]] diff --git a/r/src/compression.cpp b/r/src/compression.cpp index 148c6e14002f5..bc893afd8d28b 100644 --- a/r/src/compression.cpp +++ b/r/src/compression.cpp @@ -22,7 +22,7 @@ // [[arrow::export]] std::shared_ptr util___Codec__Create(arrow::Compression::type codec, - R_xlen_t compression_level) { + int compression_level) { return ValueOrStop(arrow::util::Codec::Create(codec, compression_level)); } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 87d1326ed3419..bd97e30005ca3 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -241,10 +241,10 @@ std::shared_ptr make_compute_options( interpolation); } if (!Rf_isNull(options["min_count"])) { - out->min_count = cpp11::as_cpp(options["min_count"]); + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["skip_nulls"])) { - out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); } return out; } @@ -479,9 +479,9 @@ std::shared_ptr make_compute_options( func_name == "hash_stddev") { using Options = arrow::compute::VarianceOptions; auto out = std::make_shared(); - out->ddof = cpp11::as_cpp(options["ddof"]); + out->ddof = cpp11::as_cpp(options["ddof"]); if (!Rf_isNull(options["min_count"])) { - out->min_count = cpp11::as_cpp(options["min_count"]); + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["skip_nulls"])) { out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); @@ -683,7 +683,7 @@ arrow::Status CallRScalarUDF(arrow::compute::KernelContext* context, } } - cpp11::sexp batch_length_sexp = cpp11::as_sexp(span.length); + cpp11::sexp batch_length_sexp = cpp11::as_sexp(static_cast(span.length)); std::shared_ptr output_type = result->type()->GetSharedPtr(); cpp11::sexp output_type_sexp = cpp11::to_r6(output_type); @@ -738,8 +738,7 @@ void RegisterScalarUDF(std::string name, cpp11::list func_sexp) { // Compute the Arity from the list of input kernels. We don't currently handle // variable numbers of arguments in a user-defined function. 
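// num_fields() returns an int, so n_args is declared as int below to avoid an
// implicit narrowing conversion (part of the -Wconversion cleanup).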
- int64_t n_args = - cpp11::as_cpp>(in_type_r[0])->num_fields(); + int n_args = cpp11::as_cpp>(in_type_r[0])->num_fields(); for (R_xlen_t i = 1; i < n_kernels; i++) { auto in_types = cpp11::as_cpp>(in_type_r[i]); if (in_types->num_fields() != n_args) { @@ -767,7 +766,7 @@ void RegisterScalarUDF(std::string name, cpp11::list func_sexp) { cpp11::sexp out_type_func = out_type_r[i]; std::vector compute_in_types(in_types->num_fields()); - for (int64_t j = 0; j < in_types->num_fields(); j++) { + for (int j = 0; j < in_types->num_fields(); j++) { compute_in_types[j] = arrow::compute::InputType(in_types->field(j)->type()); } diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 83c430fb634d3..e53fc03bdb413 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -343,8 +343,8 @@ std::shared_ptr dataset___JsonFragmentScanOptions__ std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, - int64_t thrift_string_size_limit, - int64_t thrift_container_size_limit) { + int32_t thrift_string_size_limit, + int32_t thrift_container_size_limit) { auto options = std::make_shared(); if (use_buffered_stream) { options->reader_properties->enable_buffered_stream(); diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index f19ba92527157..2f2b89d658d91 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -201,7 +201,7 @@ std::shared_ptr DayTimeInterval__initialize() { } // [[arrow::export]] -std::shared_ptr FixedSizeBinary__initialize(R_xlen_t byte_width) { +std::shared_ptr FixedSizeBinary__initialize(int32_t byte_width) { if (byte_width == NA_INTEGER) { cpp11::stop("'byte_width' cannot be NA"); } diff --git a/r/src/io.cpp b/r/src/io.cpp index 321b1b17febc3..4d5ee31794ae8 100644 --- a/r/src/io.cpp +++ b/r/src/io.cpp @@ -253,11 +253,16 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return arrow::Status::IOError("R connection is closed"); } + if (nbytes > std::numeric_limits::max()) { + return arrow::Status::Invalid( + "Can't read more than INT_MAX bytes from an R connection"); + } + return SafeCallIntoR( [&] { cpp11::function read_bin = cpp11::package("base")["readBin"]; cpp11::writable::raws ptype((R_xlen_t)0); - cpp11::integers n = cpp11::as_sexp(nbytes); + cpp11::integers n = cpp11::as_sexp(static_cast(nbytes)); cpp11::sexp result = read_bin(connection_sexp_, ptype, n); @@ -512,8 +517,8 @@ struct ReencodeUTF8TransformFunctionWrapper { // UTF-16, and UTF-32. while (in_bytes_left > 0) { // Make enough place in the output to hopefully consume all of the input. 
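// The reserved size below is computed with an explicit cast so the integer
// conversion of in_bytes_left * kOversizeFactor is intentional rather than implicit.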
- RETURN_NOT_OK( - builder.Reserve(std::max(in_bytes_left * kOversizeFactor, 4))); + RETURN_NOT_OK(builder.Reserve( + std::max(static_cast(in_bytes_left * kOversizeFactor), 4))); out_buf = builder.mutable_data() + builder.length(); out_bytes_left = builder.capacity() - builder.length(); diff --git a/r/src/message.cpp b/r/src/message.cpp index d9832ddc22a74..3f21873fea3b2 100644 --- a/r/src/message.cpp +++ b/r/src/message.cpp @@ -39,8 +39,8 @@ std::shared_ptr ipc___Message__body( } // [[arrow::export]] -r_vec_size ipc___Message__Verify(const std::unique_ptr& message) { - return r_vec_size(message->Verify()); +bool ipc___Message__Verify(const std::unique_ptr& message) { + return message->Verify(); } // [[arrow::export]] diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d9bf848e24292..d2db11e14a787 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -335,7 +335,7 @@ struct RConvert { template static enable_if_integer> Convert(Type*, From from) { - return CIntFromRScalarImpl(from); + return CIntFromRScalarImpl(static_cast(from)); } // ---- convert R integer types to double @@ -461,7 +461,7 @@ class RPrimitiveConverter< if (std::is_same::value) { auto append_value = [this](r_value_type value) { - this->primitive_builder_->UnsafeAppend(value); + this->primitive_builder_->UnsafeAppend(static_cast(value)); return Status::OK(); }; return VisitVector(it, size, append_null, append_value); @@ -595,19 +595,21 @@ class RPrimitiveConverter::value>> return VisitVector(it, size, append_null, append_value); } - static int FromRDate(const Date32Type*, int from) { return from; } + static int FromRDate(const Date32Type*, double from) { return static_cast(from); } - static int64_t FromRDate(const Date64Type*, int from) { + static int64_t FromRDate(const Date64Type*, double from) { constexpr int64_t kMilliSecondsPerDay = 86400000; - return from * kMilliSecondsPerDay; + return static_cast(from * kMilliSecondsPerDay); } static int FromPosixct(const Date32Type*, double from) { constexpr int64_t kSecondsPerDay = 86400; - return from / kSecondsPerDay; + return static_cast(from / kSecondsPerDay); } - static int64_t FromPosixct(const Date64Type*, double from) { return from * 1000; } + static int64_t FromPosixct(const Date64Type*, double from) { + return static_cast(from * 1000); + } }; int64_t get_TimeUnit_multiplier(TimeUnit::type unit) { @@ -1081,7 +1083,7 @@ class RListConverter : public ListConverter { auto append_value = [this](SEXP value) { // TODO: if we decide that this can be run concurrently // we'll have to do vec_size() upfront - int n = arrow::r::vec_size(value); + R_xlen_t n = arrow::r::vec_size(value); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(n)); RETURN_NOT_OK(this->list_builder_->Append()); diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index aca3a74fd81df..bf88e98ed1026 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -27,8 +27,8 @@ #include // [[arrow::export]] -r_vec_size RecordBatch__num_columns(const std::shared_ptr& x) { - return r_vec_size(x->num_columns()); +int RecordBatch__num_columns(const std::shared_ptr& x) { + return x->num_columns(); } // [[arrow::export]] @@ -80,7 +80,7 @@ cpp11::list RecordBatch__columns(const std::shared_ptr& batc // [[arrow::export]] std::shared_ptr RecordBatch__column( - const std::shared_ptr& batch, R_xlen_t i) { + const std::shared_ptr& batch, int i) { arrow::r::validate_index(i, batch->num_columns()); return batch->column(i); } @@ -106,7 +106,7 @@ bool RecordBatch__Equals(const std::shared_ptr& 
self, // [[arrow::export]] std::shared_ptr RecordBatch__AddColumn( - const std::shared_ptr& batch, R_xlen_t i, + const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(batch->AddColumn(i, field, column)); @@ -114,7 +114,7 @@ std::shared_ptr RecordBatch__AddColumn( // [[arrow::export]] std::shared_ptr RecordBatch__SetColumn( - const std::shared_ptr& batch, R_xlen_t i, + const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(batch->SetColumn(i, field, column)); @@ -122,14 +122,14 @@ std::shared_ptr RecordBatch__SetColumn( // [[arrow::export]] std::shared_ptr RecordBatch__RemoveColumn( - const std::shared_ptr& batch, R_xlen_t i) { + const std::shared_ptr& batch, int i) { arrow::r::validate_index(i, batch->num_columns()); return ValueOrStop(batch->RemoveColumn(i)); } // [[arrow::export]] std::string RecordBatch__column_name(const std::shared_ptr& batch, - R_xlen_t i) { + int i) { arrow::r::validate_index(i, batch->num_columns()); return batch->column_name(i); } diff --git a/r/src/schema.cpp b/r/src/schema.cpp index cf959707305a7..41d3d38d2eda3 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -29,14 +29,14 @@ std::shared_ptr Schema__from_fields( // [[arrow::export]] std::shared_ptr Schema__from_list(cpp11::list field_list) { - int n = field_list.size(); + R_xlen_t n = field_list.size(); bool nullable = true; cpp11::strings names(field_list.attr(R_NamesSymbol)); std::vector> fields(n); - for (int i = 0; i < n; i++) { + for (R_xlen_t i = 0; i < n; i++) { fields[i] = arrow::field( names[i], cpp11::as_cpp>(field_list[i]), nullable); diff --git a/r/src/table.cpp b/r/src/table.cpp index 04537000f5d48..04a8c7caf24fd 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -23,8 +23,8 @@ #include // [[arrow::export]] -r_vec_size Table__num_columns(const std::shared_ptr& x) { - return r_vec_size(x->num_columns()); +int Table__num_columns(const std::shared_ptr& x) { + return x->num_columns(); } // [[arrow::export]] @@ -49,14 +49,14 @@ std::shared_ptr Table__ReplaceSchemaMetadata( // [[arrow::export]] std::shared_ptr Table__column( - const std::shared_ptr& table, R_xlen_t i) { + const std::shared_ptr& table, int i) { arrow::r::validate_index(i, table->num_columns()); return table->column(i); } // [[arrow::export]] std::shared_ptr Table__field(const std::shared_ptr& table, - R_xlen_t i) { + int i) { arrow::r::validate_index(i, table->num_columns()); return table->field(i); } @@ -123,13 +123,13 @@ std::shared_ptr Table__GetColumnByName( // [[arrow::export]] std::shared_ptr Table__RemoveColumn( - const std::shared_ptr& table, R_xlen_t i) { + const std::shared_ptr& table, int i) { return ValueOrStop(table->RemoveColumn(i)); } // [[arrow::export]] std::shared_ptr Table__AddColumn( - const std::shared_ptr& table, R_xlen_t i, + const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(table->AddColumn(i, field, column)); @@ -137,7 +137,7 @@ std::shared_ptr Table__AddColumn( // [[arrow::export]] std::shared_ptr Table__SetColumn( - const std::shared_ptr& table, R_xlen_t i, + const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(table->SetColumn(i, field, column)); @@ -241,7 +241,7 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, // Remove metadata for ExtensionType columns, because these have their own mechanism for // preserving R type 
information - for (R_xlen_t i = 0; i < schema->num_fields(); i++) { + for (int i = 0; i < schema->num_fields(); i++) { if (schema->field(i)->type()->id() == Type::EXTENSION) { metadata_columns[i] = R_NilValue; } From d51954415882423584f2a95b0897aa4d073a4e1c Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 23 Dec 2023 15:03:47 +0000 Subject: [PATCH 097/570] GH-39320: [C++][FS][Azure] Add managed identity auth configuration (#39321) ### Rationale for this change Workload identity is a useful Azure authentication method. Also I failed to set the account_name correctly for a bunch of auths (I think this got lost in a rebase then I copy pasted the broken code). ### What changes are included in this PR? - Make filesystem initialisation fail if `account_name_.empty()`. This prevents the account name configuration bug we had. Also added a test asserting that filesystem initialization fails in this case. - Remove account name configuration on all auth configs, in favour of setting in separately from the auth configuration. - Implement `AzureOptions::ConfigureManagedIdentityCredential` ### Are these changes tested? Added a simple test initialising a filesystem using `ConfigureManagedIdentityCredential`. This is not the most comprehensive test but its the same as what we agreed on for https://github.com/apache/arrow/pull/39263. ### Are there any user-facing changes? Managed identity authentication is now supported. * Closes: #39320 Authored-by: Thomas Newton Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 38 ++++++++++++++++-------- cpp/src/arrow/filesystem/azurefs.h | 16 +++++----- cpp/src/arrow/filesystem/azurefs_test.cc | 34 +++++++++++++++++---- 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 26c2761886050..21350a490411a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -58,7 +58,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const { blob_storage_scheme == other.blob_storage_scheme && dfs_storage_scheme == other.dfs_storage_scheme && default_metadata == other.default_metadata && - account_name_ == other.account_name_ && + account_name == other.account_name && credential_kind_ == other.credential_kind_; if (!equals) { return false; @@ -104,17 +104,17 @@ std::string AzureOptions::AccountDfsUrl(const std::string& account_name) const { return BuildBaseUrl(dfs_storage_scheme, dfs_storage_authority, account_name); } -Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key) { +Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_key) { credential_kind_ = CredentialKind::kStorageSharedKeyCredential; - account_name_ = account_name; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } storage_shared_key_credential_ = std::make_shared(account_name, account_key); return Status::OK(); } -Status AzureOptions::ConfigureClientSecretCredential(const std::string& account_name, - const std::string& tenant_id, +Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { credential_kind_ = CredentialKind::kTokenCredential; @@ -123,14 +123,20 @@ Status AzureOptions::ConfigureClientSecretCredential(const std::string& account_ return Status::OK(); } -Status 
AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { +Status AzureOptions::ConfigureDefaultCredential() { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); return Status::OK(); } -Status AzureOptions::ConfigureWorkloadIdentityCredential( - const std::string& account_name) { +Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& client_id) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = + std::make_shared(client_id); + return Status::OK(); +} + +Status AzureOptions::ConfigureWorkloadIdentityCredential() { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); return Status::OK(); @@ -138,14 +144,17 @@ Status AzureOptions::ConfigureWorkloadIdentityCredential( Result> AzureOptions::MakeBlobServiceClient() const { + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: - return std::make_unique(AccountBlobUrl(account_name_), + return std::make_unique(AccountBlobUrl(account_name), token_credential_); case CredentialKind::kStorageSharedKeyCredential: - return std::make_unique(AccountBlobUrl(account_name_), + return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); @@ -153,15 +162,18 @@ Result> AzureOptions::MakeBlobServiceC Result> AzureOptions::MakeDataLakeServiceClient() const { + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: return std::make_unique( - AccountDfsUrl(account_name_), token_credential_); + AccountDfsUrl(account_name), token_credential_); case CredentialKind::kStorageSharedKeyCredential: return std::make_unique( - AccountDfsUrl(account_name_), storage_shared_key_credential_); + AccountDfsUrl(account_name), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 346dd349e935c..78e0a8148c616 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -48,6 +48,9 @@ class TestAzureFileSystem; /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { + /// \brief account name of the Azure Storage account. + std::string account_name; + /// \brief hostname[:port] of the Azure Blob Storage Service. 
/// /// If the hostname is a relative domain name (one that starts with a '.'), then storage @@ -94,7 +97,6 @@ struct ARROW_EXPORT AzureOptions { kStorageSharedKeyCredential, } credential_kind_ = CredentialKind::kAnonymous; - std::string account_name_; std::shared_ptr token_credential_; std::shared_ptr storage_shared_key_credential_; @@ -103,15 +105,15 @@ struct ARROW_EXPORT AzureOptions { AzureOptions(); ~AzureOptions(); - Status ConfigureDefaultCredential(const std::string& account_name); + Status ConfigureDefaultCredential(); + + Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string()); - Status ConfigureWorkloadIdentityCredential(const std::string& account_name); + Status ConfigureWorkloadIdentityCredential(); - Status ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key); + Status ConfigureAccountKeyCredential(const std::string& account_key); - Status ConfigureClientSecretCredential(const std::string& account_name, - const std::string& tenant_id, + Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 62c5ef2232045..f6af9f722dbac 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -271,22 +271,44 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +TEST(AzureFileSystem, InitializingFilesystemWithoutAccountNameFails) { + AzureOptions options; + ASSERT_RAISES(Invalid, options.ConfigureAccountKeyCredential("account_key")); + + ARROW_EXPECT_OK( + options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); + ASSERT_RAISES(Invalid, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithClientSecretCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureClientSecretCredential( - "dummy-account-name", "tenant_id", "client_id", "client_secret")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK( + options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureDefaultCredential()); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } +TEST(AzureFileSystem, InitializeFilesystemWithManagedIdentityCredential) { + AzureOptions options; + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential()); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); + + ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential("specific-client-id")); + EXPECT_OK_AND_ASSIGN(fs, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithWorkloadIdentityCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential("dummy-account-name")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential()); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -383,6 +405,7 @@ class TestAzureFileSystem : public ::testing::Test { static Result 
MakeOptions(BaseAzureEnv* env) { AzureOptions options; + options.account_name = env->account_name(); switch (env->backend()) { case AzureBackend::kAzurite: options.blob_storage_authority = "127.0.0.1:10000"; @@ -394,8 +417,7 @@ class TestAzureFileSystem : public ::testing::Test { // Use the default values break; } - ARROW_EXPECT_OK( - options.ConfigureAccountKeyCredential(env->account_name(), env->account_key())); + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); return options; } From ec41209ea02bdb410bc7e049cb3100afedf4ba2f Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 23 Dec 2023 23:50:39 +0800 Subject: [PATCH 098/570] GH-37055: [C++] Optimize hash kernels for Dictionary ChunkedArrays (#38394) ### Rationale for this change When merging dictionaries across chunks, the hash kernels unnecessarily unify the existing dictionary, dragging down the performance. ### What changes are included in this PR? Reuse the dictionary unifier across chunks. ### Are these changes tested? Yes, with a new benchmark for dictionary chunked arrays. ### Are there any user-facing changes? No. * Closes: #37055 Lead-authored-by: Jin Shang Co-authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/kernels/vector_hash.cc | 55 +++++++++++++------ .../compute/kernels/vector_hash_benchmark.cc | 36 ++++++++++++ 2 files changed, 74 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 65e59d1a2eb14..800deba3a5ed2 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -26,17 +26,20 @@ #include "arrow/array/concatenate.h" #include "arrow/array/dict_internal.h" #include "arrow/array/util.h" +#include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/int_util.h" #include "arrow/util/unreachable.h" namespace arrow { using internal::DictionaryTraits; using internal::HashTraits; +using internal::TransposeInts; namespace compute { namespace internal { @@ -448,9 +451,9 @@ class DictionaryHashKernel : public HashKernel { Status Append(const ArraySpan& arr) override { auto arr_dict = arr.dictionary().ToArray(); - if (!dictionary_) { - dictionary_ = arr_dict; - } else if (!dictionary_->Equals(*arr_dict)) { + if (!first_dictionary_) { + first_dictionary_ = arr_dict; + } else if (!first_dictionary_->Equals(*arr_dict)) { // NOTE: This approach computes a new dictionary unification per chunk. // This is in effect O(n*k) where n is the total chunked array length and // k is the number of chunks (therefore O(n**2) if chunks have a fixed size). @@ -458,21 +461,23 @@ class DictionaryHashKernel : public HashKernel { // A better approach may be to run the kernel over each individual chunk, // and then hash-aggregate all results (for example sum-group-by for // the "value_counts" kernel). 
- auto out_dict_type = dictionary_->type(); + if (dictionary_unifier_ == nullptr) { + ARROW_ASSIGN_OR_RAISE(dictionary_unifier_, + DictionaryUnifier::Make(first_dictionary_->type())); + RETURN_NOT_OK(dictionary_unifier_->Unify(*first_dictionary_)); + } + auto out_dict_type = first_dictionary_->type(); std::shared_ptr transpose_map; - std::shared_ptr out_dict; - ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type)); - ARROW_CHECK_OK(unifier->Unify(*dictionary_)); - ARROW_CHECK_OK(unifier->Unify(*arr_dict, &transpose_map)); - ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict)); + RETURN_NOT_OK(dictionary_unifier_->Unify(*arr_dict, &transpose_map)); - dictionary_ = out_dict; auto transpose = reinterpret_cast(transpose_map->data()); - auto in_dict_array = arr.ToArray(); + auto in_array = arr.ToArray(); + const auto& in_dict_array = + arrow::internal::checked_cast(*in_array); ARROW_ASSIGN_OR_RAISE( - auto tmp, arrow::internal::checked_cast(*in_dict_array) - .Transpose(arr.type->GetSharedPtr(), out_dict, transpose)); + auto tmp, in_dict_array.Transpose(arr.type->GetSharedPtr(), + in_dict_array.dictionary(), transpose)); return indices_kernel_->Append(*tmp->data()); } @@ -495,12 +500,27 @@ class DictionaryHashKernel : public HashKernel { return dictionary_value_type_; } - std::shared_ptr dictionary() const { return dictionary_; } + /// This can't be called more than once because DictionaryUnifier::GetResult() + /// can't be called more than once and produce the same output. + Result> dictionary() const { + if (!first_dictionary_) { // Append was never called + return nullptr; + } + if (!dictionary_unifier_) { // Append was called only once + return first_dictionary_; + } + + auto out_dict_type = first_dictionary_->type(); + std::shared_ptr out_dict; + RETURN_NOT_OK(dictionary_unifier_->GetResult(&out_dict_type, &out_dict)); + return out_dict; + } private: std::unique_ptr indices_kernel_; - std::shared_ptr dictionary_; + std::shared_ptr first_dictionary_; std::shared_ptr dictionary_value_type_; + std::unique_ptr dictionary_unifier_; }; // ---------------------------------------------------------------------- @@ -630,8 +650,9 @@ Status ValueCountsFinalize(KernelContext* ctx, std::vector* out) { // hence have no dictionary. 
Result> EnsureHashDictionary(KernelContext* ctx, DictionaryHashKernel* hash) { - if (hash->dictionary()) { - return hash->dictionary()->data(); + ARROW_ASSIGN_OR_RAISE(auto dict, hash->dictionary()); + if (dict) { + return dict->data(); } ARROW_ASSIGN_OR_RAISE(auto null, MakeArrayOfNull(hash->dictionary_value_type(), /*length=*/0, ctx->memory_pool())); diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index e9548e133aa00..472f50db8cf92 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -25,6 +25,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" +#include "arrow/util/logging.h" #include "arrow/compute/api.h" @@ -226,6 +227,33 @@ static void UniqueString100bytes(benchmark::State& state) { BenchUnique(state, HashParams{general_bench_cases[state.range(0)], 100}); } +template +void BenchValueCountsDictionaryChunks(benchmark::State& state, const ParamType& params) { + std::shared_ptr arr; + params.GenerateTestData(&arr); + // chunk arr to 100 slices + std::vector> chunks; + const int64_t chunk_size = arr->length() / 100; + for (int64_t i = 0; i < 100; ++i) { + auto slice = arr->Slice(i * chunk_size, chunk_size); + auto datum = DictionaryEncode(slice).ValueOrDie(); + ARROW_CHECK(datum.is_array()); + chunks.push_back(datum.make_array()); + } + auto chunked_array = std::make_shared(chunks); + + while (state.KeepRunning()) { + ABORT_NOT_OK(ValueCounts(chunked_array).status()); + } + params.SetMetadata(state); +} + +static void ValueCountsDictionaryChunks(benchmark::State& state) { + // Dictionary of byte strings with 10 bytes each + BenchValueCountsDictionaryChunks( + state, HashParams{general_bench_cases[state.range(0)], 10}); +} + void HashSetArgs(benchmark::internal::Benchmark* bench) { for (int i = 0; i < static_cast(general_bench_cases.size()); ++i) { bench->Arg(i); @@ -239,6 +267,14 @@ BENCHMARK(UniqueInt64)->Apply(HashSetArgs); BENCHMARK(UniqueString10bytes)->Apply(HashSetArgs); BENCHMARK(UniqueString100bytes)->Apply(HashSetArgs); +void DictionaryChunksHashSetArgs(benchmark::internal::Benchmark* bench) { + for (int i = 0; i < static_cast(general_bench_cases.size()); ++i) { + bench->Arg(i); + } +} + +BENCHMARK(ValueCountsDictionaryChunks)->Apply(DictionaryChunksHashSetArgs); + void UInt8SetArgs(benchmark::internal::Benchmark* bench) { for (int i = 0; i < static_cast(uint8_bench_cases.size()); ++i) { bench->Arg(i); From 90f7ecab559870dc862d34b5ac323c77c7050353 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Mon, 25 Dec 2023 05:23:17 -0500 Subject: [PATCH 099/570] GH-39017: [JS] Add `typeId` as attribute (#39018) ### Rationale for this change Support reconstructing `DataType` after `postMessage`. ### What changes are included in this PR? Make `typeId` an attribute, not a getter. ### Are these changes tested? Passes all existing tests. ### Are there any user-facing changes? 
No * Closes: #39017 --- js/src/type.ts | 70 ++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/js/src/type.ts b/js/src/type.ts index dea5301aed355..ae3aefa025999 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -79,7 +79,11 @@ export abstract class DataTypeType.NONE; } + declare public readonly typeId: TType; + + constructor(typeId: TType) { + this.typeId = typeId; + } protected static [Symbol.toStringTag] = ((proto: DataType) => { (proto).children = null; @@ -93,8 +97,10 @@ export abstract class DataType { TArray: void; TValue: null } /** @ignore */ export class Null extends DataType { + constructor() { + super(Type.Null); + } public toString() { return `Null`; } - public get typeId() { return Type.Null as Type.Null; } protected static [Symbol.toStringTag] = ((proto: Null) => proto[Symbol.toStringTag] = 'Null')(Null.prototype); } @@ -119,9 +125,8 @@ interface Int_ extends DataType { TArray: IType[T]['TA class Int_ extends DataType { constructor(public readonly isSigned: IType[T]['isSigned'], public readonly bitWidth: IType[T]['bitWidth']) { - super(); + super(Type.Int as T); } - public get typeId() { return Type.Int as T; } public get ArrayType() { switch (this.bitWidth) { case 8: return this.isSigned ? Int8Array : Uint8Array; @@ -206,9 +211,8 @@ export interface Float extends DataType { TArray: /** @ignore */ export class Float extends DataType { constructor(public readonly precision: Precision) { - super(); + super(Type.Float as T); } - public get typeId() { return Type.Float as T; } public get ArrayType(): TypedArrayConstructor { switch (this.precision) { case Precision.HALF: return Uint16Array; @@ -241,9 +245,8 @@ export interface Binary extends DataType { TArray: Uint8Array; TOff /** @ignore */ export class Binary extends DataType { constructor() { - super(); + super(Type.Binary); } - public get typeId() { return Type.Binary as Type.Binary; } public toString() { return `Binary`; } protected static [Symbol.toStringTag] = ((proto: Binary) => { (proto).ArrayType = Uint8Array; @@ -256,9 +259,8 @@ export interface LargeBinary extends DataType { TArray: Uint8A /** @ignore */ export class LargeBinary extends DataType { constructor() { - super(); + super(Type.LargeBinary); } - public get typeId() { return Type.LargeBinary as Type.LargeBinary; } public toString() { return `LargeBinary`; } protected static [Symbol.toStringTag] = ((proto: LargeBinary) => { (proto).ArrayType = Uint8Array; @@ -272,9 +274,8 @@ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetA /** @ignore */ export class Utf8 extends DataType { constructor() { - super(); + super(Type.Utf8); } - public get typeId() { return Type.Utf8 as Type.Utf8; } public toString() { return `Utf8`; } protected static [Symbol.toStringTag] = ((proto: Utf8) => { (proto).ArrayType = Uint8Array; @@ -287,9 +288,8 @@ export interface LargeUtf8 extends DataType { TArray: Uint8Array /** @ignore */ export class LargeUtf8 extends DataType { constructor() { - super(); + super(Type.LargeUtf8); } - public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } public toString() { return `LargeUtf8`; } protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { (proto).ArrayType = Uint8Array; @@ -303,9 +303,8 @@ export interface Bool extends DataType { TArray: Uint8Array; TValue: /** @ignore */ export class Bool extends DataType { constructor() { - super(); + super(Type.Bool); } - public get typeId() { return Type.Bool as Type.Bool; } public toString() { return `Bool`; } 
protected static [Symbol.toStringTag] = ((proto: Bool) => { (proto).ArrayType = Uint8Array; @@ -320,9 +319,8 @@ export class Decimal extends DataType { constructor(public readonly scale: number, public readonly precision: number, public readonly bitWidth: number = 128) { - super(); + super(Type.Decimal); } - public get typeId() { return Type.Decimal as Type.Decimal; } public toString() { return `Decimal[${this.precision}e${this.scale > 0 ? `+` : ``}${this.scale}]`; } protected static [Symbol.toStringTag] = ((proto: Decimal) => { (proto).scale = null; @@ -339,9 +337,8 @@ export interface Date_ extends DataType { TArray: In /** @ignore */ export class Date_ extends DataType { constructor(public readonly unit: DateUnit) { - super(); + super(Type.Date as T); } - public get typeId() { return Type.Date as T; } public toString() { return `Date${(this.unit + 1) * 32}<${DateUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Date_) => { (proto).unit = null; @@ -375,9 +372,8 @@ interface Time_ extends DataType { class Time_ extends DataType { constructor(public readonly unit: TimesType[T]['unit'], public readonly bitWidth: TimeBitWidth) { - super(); + super(Type.Time as T); } - public get typeId() { return Type.Time as T; } public toString() { return `Time${this.bitWidth}<${TimeUnit[this.unit]}>`; } public get ArrayType() { switch (this.bitWidth) { @@ -418,9 +414,8 @@ interface Timestamp_ extends DataType { class Timestamp_ extends DataType { constructor(public readonly unit: TimeUnit, public readonly timezone?: string | null) { - super(); + super(Type.Timestamp as T); } - public get typeId() { return Type.Timestamp as T; } public toString() { return `Timestamp<${TimeUnit[this.unit]}${this.timezone ? `, ${this.timezone}` : ``}>`; } protected static [Symbol.toStringTag] = ((proto: Timestamp_) => { (proto).unit = null; @@ -453,9 +448,8 @@ interface Interval_ extends DataType { /** @ignore */ class Interval_ extends DataType { constructor(public readonly unit: IntervalUnit) { - super(); + super(Type.Interval as T); } - public get typeId() { return Type.Interval as T; } public toString() { return `Interval<${IntervalUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Interval_) => { (proto).unit = null; @@ -483,9 +477,8 @@ export interface Duration extends DataType { /** @ignore */ export class Duration extends DataType { constructor(public readonly unit: TimeUnit) { - super(); + super(Type.Duration as T); } - public get typeId() { return Type.Duration as T; } public toString() { return `Duration<${TimeUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Duration) => { (proto).unit = null; @@ -513,11 +506,10 @@ export interface List extends DataType extends DataType { constructor(child: Field) { - super(); + super(Type.List); this.children = [child]; } public declare readonly children: Field[]; - public get typeId() { return Type.List as Type.List; } public toString() { return `List<${this.valueType}>`; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } @@ -540,10 +532,9 @@ export class Struct extends DataType { public declare _row: StructRow; public declare readonly children: Field[]; constructor(children: Field[]) { - super(); + super(Type.Struct); this.children = children; } - public get typeId() { return Type.Struct as Type.Struct; } public toString() { return `Struct<{${this.children.map((f) => `${f.name}:${f.type}`).join(`, `)}}>`; } protected static 
[Symbol.toStringTag] = ((proto: Struct) => { (proto).children = null; @@ -564,13 +555,12 @@ class Union_ extends DataType { constructor(mode: UnionMode, typeIds: number[] | Int32Array, children: Field[]) { - super(); + super(Type.Union as T); this.mode = mode; this.children = children; this.typeIds = typeIds = Int32Array.from(typeIds); this.typeIdToChildIndex = typeIds.reduce((typeIdToChildIndex, typeId, idx) => (typeIdToChildIndex[typeId] = idx) && typeIdToChildIndex || typeIdToChildIndex, Object.create(null) as { [key: number]: number }); } - public get typeId() { return Type.Union as T; } public toString() { return `${this[Symbol.toStringTag]}<${this.children.map((x) => `${x.type}`).join(` | `) }>`; @@ -611,9 +601,8 @@ export interface FixedSizeBinary extends DataType { /** @ignore */ export class FixedSizeBinary extends DataType { constructor(public readonly byteWidth: number) { - super(); + super(Type.FixedSizeBinary); } - public get typeId() { return Type.FixedSizeBinary as Type.FixedSizeBinary; } public toString() { return `FixedSizeBinary[${this.byteWidth}]`; } protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; @@ -632,10 +621,9 @@ export interface FixedSizeList extends DataType extends DataType { public declare readonly children: Field[]; constructor(public readonly listSize: number, child: Field) { - super(); + super(Type.FixedSizeList); this.children = [child]; } - public get typeId() { return Type.FixedSizeList as Type.FixedSizeList; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } @@ -657,7 +645,7 @@ export interface Map_ extends DataType }> { constructor(entries: Field>, keysSorted = false) { - super(); + super(Type.Map); this.children = [entries]; this.keysSorted = keysSorted; // ARROW-8716 @@ -678,7 +666,6 @@ export class Map_ ex } public declare readonly keysSorted: boolean; public declare readonly children: Field>[]; - public get typeId() { return Type.Map as Type.Map; } public get keyType(): TKey { return this.children[0].type.children[0].type as TKey; } public get valueType(): TValue { return this.children[0].type.children[1].type as TValue; } public get childType() { return this.children[0].type as Struct<{ key: TKey; value: TValue }>; } @@ -709,13 +696,12 @@ export class Dictionary ex public declare readonly dictionary: T; public declare readonly isOrdered: boolean; constructor(dictionary: T, indices: TKey, id?: bigint | number | null, isOrdered?: boolean | null) { - super(); + super(Type.Dictionary); this.indices = indices; this.dictionary = dictionary; this.isOrdered = isOrdered || false; this.id = id == null ? getId() : bigIntToNumber(id); } - public get typeId() { return Type.Dictionary as Type.Dictionary; } public get children() { return this.dictionary.children; } public get valueType(): T { return this.dictionary as T; } public get ArrayType(): T['ArrayType'] { return this.dictionary.ArrayType; } From 4d9a860196c2959c8595e117452ef5094ce7363c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:09:10 +0900 Subject: [PATCH 100/570] MINOR: [C#] Bump xunit.runner.visualstudio from 2.5.5 to 2.5.6 in /csharp (#39369) Bumps [xunit.runner.visualstudio](https://github.com/xunit/visualstudio.xunit) from 2.5.5 to 2.5.6.
    [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit.runner.visualstudio&package-manager=nuget&previous-version=2.5.5&new-version=2.5.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 94ef4b5f3c5f5..e3d86f0dd9992 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 46d0a59b5d8e1..4dd479545a74c 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 99c772770d6c6..114e76ad984f1 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index fde30a90e6479..71f68fe2d49e3 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all runtime; build; native; contentfiles; analyzers From 35db6f78a2e2b45e55109979c85649150d205326 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:10:24 +0900 Subject: [PATCH 101/570] MINOR: [Java] Bump org.apache.maven.plugins:maven-surefire-plugin from 3.0.0-M7 to 3.2.3 in /java (#39372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.0.0-M7 to 3.2.3.
    Release notes (sourced from org.apache.maven.plugins:maven-surefire-plugin's releases; details truncated):
    • 3.2.2: bug fixes, dependency updates, build
    • 3.2.1: new features and improvements, bug fixes, documentation updates, maintenance
    • 3.1.2: see "Release Notes - Maven Surefire - Version 3.1.2"

    Commits
    • ac9e574 [maven-release-plugin] prepare release surefire-3.2.3
    • 2d6cbc6 [SUREFIRE-2220] SurefireForkChannel#getForkNodeConnectionString() returns inv...
    • 05322d9 [SUREFIRE-2212] OutOfMemoryError raised when parsing files with huge stderr/s...
    • 55ccd06 [SUREFIRE-2211] additionalClasspathElement with UNC path not working with Mav...
    • aa864f4 [SUREFIRE-2216] Upgrade plugins and components (in ITs)
    • 6662e07 [SUREFIRE-2215] Upgrade to Parent 41
    • f5b73ab [SUREFIRE-2214] Uprade to HtmlUnit 3.8.0
    • 47c5816 [SUREFIRE-2210] - Restore ordering of additional class path elements
    • 9b7ecf1 [maven-release-plugin] prepare for next development iteration
    • 2d76753 [maven-release-plugin] prepare release surefire-3.2.2
    • Additional commits viewable in compare view

    [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-surefire-plugin&package-manager=maven&previous-version=3.0.0-M7&new-version=3.2.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/performance/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index a3e4da85b4321..888c0fb367932 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -211,7 +211,7 @@ maven-surefire-plugin - 3.0.0-M7 + 3.2.3 diff --git a/java/pom.xml b/java/pom.xml index 4cca5e7245f0f..27d1504016ee6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -442,7 +442,7 @@ maven-surefire-plugin - 3.0.0-M7 + 3.2.3 org.junit.jupiter From 9126021e675e7e021a11a90a7ab7d67bd6529712 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:10:49 +0900 Subject: [PATCH 102/570] MINOR: [Java] Bump org.apache.maven.plugins:maven-resources-plugin from 2.6 to 3.3.1 in /java (#39373) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-resources-plugin](https://github.com/apache/maven-resources-plugin) from 2.6 to 3.3.1.
    Release notes (sourced from org.apache.maven.plugins:maven-resources-plugin's releases; details truncated):
    • 3.3.1: removals, dependency updates, documentation updates (doc: adds alternative variable syntax using @ delimiters to the documentation (#36) @ kevin0x90)
    • 3.3.0: dependency updates, documentation updates, maintenance
    • 3.2.0: "What's Changed" and "New Contributors"

    Commits
    • 978ce1e [maven-release-plugin] prepare release maven-resources-plugin-3.3.1
    • b7cd080 [MRESOURCES-296] Upgrade to maven-filtering 3.3.1
    • 1c9f610 [MRESOURCES-288] Make tests-jar reproducible (#56)
    • 1946127 [MRESOURCES-293] Rollback
    • f7a6f22 [MRESOURCES-297] Update to parent POM 39, reformat (#55)
    • 22d64ca remove specific IDE m2e files (#40)
    • 02c2d01 [MRESOURCES-293] Make resources param not read-only (#54)
    • 6bb3e1f [MRESOURCES-295] Drop Plexus legacy code (#53)
    • df7e172 [MRESOURCES-294] Upgrade plexus-utils to 3.5.1
    • 9354ecd Bump apache/maven-gh-actions-shared from 2 to 3
    • Additional commits viewable in compare view

    [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-resources-plugin&package-manager=maven&previous-version=2.6&new-version=3.3.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/performance/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 888c0fb367932..4d449af46b6b1 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -199,7 +199,7 @@
    maven-resources-plugin - 2.6 + 3.3.1 maven-site-plugin diff --git a/java/pom.xml b/java/pom.xml index 27d1504016ee6..1776407e3d030 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -412,7 +412,7 @@ org.apache.maven.plugins maven-resources-plugin - 2.6 + 3.3.1 org.apache.maven.plugins From 6bb77464940bf97dbd042bbf1c6048439f4c0695 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 14:23:37 +0900 Subject: [PATCH 103/570] MINOR: [C#] Bump xunit from 2.6.3 to 2.6.4 in /csharp (#39370) Bumps [xunit](https://github.com/xunit/xunit) from 2.6.3 to 2.6.4.
    [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit&package-manager=nuget&previous-version=2.6.3&new-version=2.6.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index e3d86f0dd9992..dd2c75dd3df90 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 4dd479545a74c..0e9c02d61977c 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 114e76ad984f1..d38413ba45b3a 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 71f68fe2d49e3..0afd1490e7b69 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,7 +15,7 @@ - + all runtime; build; native; contentfiles; analyzers From 526b2eb298292849b133f9ddae7facdf8ee1d35f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 14:24:09 +0900 Subject: [PATCH 104/570] MINOR: [Java] Bump org.assertj:assertj-core from 3.23.1 to 3.24.2 in /java (#39375) Bumps org.assertj:assertj-core from 3.23.1 to 3.24.2. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.assertj:assertj-core&package-manager=maven&previous-version=3.23.1&new-version=3.24.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1776407e3d030..523e5642720cd 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -609,7 +609,7 @@ org.assertj assertj-core - 3.23.1 + 3.24.2 test From b32f71a157eb90a7eb107c540b9cadd343e5e388 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Tue, 26 Dec 2023 15:25:51 +0900 Subject: [PATCH 105/570] GH-39363: [C++] Use Cast() instead of CastTo() for Parquet (#39364) ### Rationale for this change Remove legacy code ### What changes are included in this PR? Replace the legacy scalar CastTo implementation for Parquet. ### Are these changes tested? Yes. It is passed by existing all test cases for Parquet. ### Are there any user-facing changes? Maybe, Yes. There is a dependency on the Parquet schema that the user handles. There may be a problem if the user has to deal with a type for which Casting is not implemented. However, in this case, it should be treated as a new issue with an implementation that improves the `Cast` compute kernel. * Closes: #39363 Authored-by: Hyunseok Seo Signed-off-by: mwish --- cpp/src/arrow/dataset/file_parquet.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 3afe4ec85cf49..1c2fd2dea6307 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -24,6 +24,7 @@ #include #include +#include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/dataset/dataset_internal.h" #include "arrow/dataset/parquet_encryption_config.h" @@ -58,6 +59,8 @@ using parquet::arrow::SchemaField; using parquet::arrow::SchemaManifest; using parquet::arrow::StatisticsAsScalars; +using compute::Cast; + namespace { parquet::ReaderProperties MakeReaderProperties( @@ -370,12 +373,12 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return std::nullopt; } - auto maybe_min = min->CastTo(field.type()); - auto maybe_max = max->CastTo(field.type()); + auto maybe_min = Cast(min, field.type()); + auto maybe_max = Cast(max, field.type()); if (maybe_min.ok() && maybe_max.ok()) { - min = maybe_min.MoveValueUnsafe(); - max = maybe_max.MoveValueUnsafe(); + min = maybe_min.MoveValueUnsafe().scalar(); + max = maybe_max.MoveValueUnsafe().scalar(); if (min->Equals(*max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); From cf44793204d88e0156669af102ff65f180a6b003 Mon Sep 17 00:00:00 2001 From: "Rossi(Ruoxi) Sun" Date: Tue, 26 Dec 2023 09:14:32 -0800 Subject: [PATCH 106/570] GH-39357: [C++] Reduce function.h includes (#39312) ### Rationale for this change As proposed in #36246 , by splitting function option structs from `function.h`, we can reduce the including of `function.h`. So that the total build time could be reduced. The total parser time could be reduced from 722.3s to 709.7s. And the `function.h` along with its transitive inclusion of `kernel.h` don't show up in expensive headers any more. The detailed analysis result before and after this PR are attached: [analyze-before.txt](https://github.com/apache/arrow/files/13756923/analyze-before.txt) [analyze-after.txt](https://github.com/apache/arrow/files/13756924/analyze-after.txt) Disclaimer (quote from https://github.com/apache/arrow/issues/36246#issuecomment-1866974963): > Note that the time diff is not absolute. 
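A note on the call shape introduced by the patch above: `arrow::compute::Cast()` operates on a `Datum` (a `std::shared_ptr<Scalar>` converts to one implicitly) and returns a `Result<Datum>`, which is why the cast result is unwrapped with `.scalar()` before the min/max statistics are compared, whereas the removed `Scalar::CastTo()` handed back a scalar directly. The following is a minimal sketch of that pattern, not code from the patch; it assumes an Arrow C++ build with the compute module enabled, and the helper name `CastStatistic` is purely illustrative.

```cpp
// Minimal sketch: cast a scalar statistic to a target type with the generic
// compute::Cast() kernel and unwrap the resulting Datum back into a Scalar.
#include <memory>

#include "arrow/compute/cast.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/scalar.h"
#include "arrow/type.h"

arrow::Result<std::shared_ptr<arrow::Scalar>> CastStatistic(
    const std::shared_ptr<arrow::Scalar>& value,
    const std::shared_ptr<arrow::DataType>& target_type) {
  // Cast() defaults to CastOptions::Safe(), so an out-of-range or otherwise
  // invalid conversion surfaces as an error Status instead of silently
  // truncating the statistic.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum casted,
                        arrow::compute::Cast(value, target_type));
  return casted.scalar();
}
```

In the dataset code shown in the diff, the callers guard on `maybe_min.ok() && maybe_max.ok()`, so a failed cast simply means no statistics-based guarantee is derived for that column rather than an error being propagated.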
The ClangBuildAnalyzer result differs from time to time. I guess it depends on the idle-ness of the building machine when doing the experiment. But the time reduction is almost certain, though sometimes more sometimes less. And the inclusion times of the questioning headers are reduced for sure, as shown in the attachments in my other comment. ### What changes are included in this PR? Move function option structs into own `compute/options.h`, and change including `function.h` to including `options.h` wherever fits. ### Are these changes tested? Build is testing. ### Are there any user-facing changes? There could be potential build failures for user code (quote from https://github.com/apache/arrow/issues/36246#issuecomment-1866980969): > The header function.h remains in compute/api.h, with and without this PR. The proposed PR removes function.h from api_xxx.h (then includes options.h instead), as proposed in the initial description of this issue. This results in compile failures for user code which includes only compute/api_xxx.h but not compute/api.h, and meanwhile uses CallFunction which is declared in function.h. But I think it's OK as described in https://github.com/apache/arrow/issues/36246#issuecomment-1867018578. * Closes: #39357 Authored-by: zanmato Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute_and_write_csv_example.cc | 2 +- cpp/src/arrow/acero/aggregate_internal.cc | 1 + cpp/src/arrow/acero/scalar_aggregate_node.cc | 1 + cpp/src/arrow/compute/api.h | 21 +++-- cpp/src/arrow/compute/api_aggregate.h | 2 +- cpp/src/arrow/compute/api_scalar.h | 2 +- cpp/src/arrow/compute/api_vector.h | 3 +- cpp/src/arrow/compute/cast.h | 1 + cpp/src/arrow/compute/function.cc | 1 + cpp/src/arrow/compute/function.h | 46 +---------- cpp/src/arrow/compute/function_options.h | 81 +++++++++++++++++++ .../kernels/scalar_if_else_benchmark.cc | 1 + cpp/src/arrow/compute/kernels/vector_rank.cc | 1 + .../kernels/vector_replace_benchmark.cc | 1 + .../kernels/vector_run_end_encode_test.cc | 1 + .../arrow/compute/kernels/vector_select_k.cc | 1 + cpp/src/arrow/compute/kernels/vector_sort.cc | 1 + cpp/src/arrow/compute/registry_test.cc | 1 + cpp/src/arrow/compute/type_fwd.h | 1 + 19 files changed, 111 insertions(+), 58 deletions(-) create mode 100644 cpp/src/arrow/compute/function_options.h diff --git a/cpp/examples/arrow/compute_and_write_csv_example.cc b/cpp/examples/arrow/compute_and_write_csv_example.cc index edf21e45b2bb7..7e0f6cdf1ce16 100644 --- a/cpp/examples/arrow/compute_and_write_csv_example.cc +++ b/cpp/examples/arrow/compute_and_write_csv_example.cc @@ -16,7 +16,7 @@ // under the License. 
#include -#include +#include #include #include #include diff --git a/cpp/src/arrow/acero/aggregate_internal.cc b/cpp/src/arrow/acero/aggregate_internal.cc index 3cd5491720dcd..9c4b7fe5ae98c 100644 --- a/cpp/src/arrow/acero/aggregate_internal.cc +++ b/cpp/src/arrow/acero/aggregate_internal.cc @@ -25,6 +25,7 @@ #include "arrow/acero/exec_plan.h" #include "arrow/acero/options.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/acero/scalar_aggregate_node.cc b/cpp/src/arrow/acero/scalar_aggregate_node.cc index ae59aa692096a..c7805f4d24eb2 100644 --- a/cpp/src/arrow/acero/scalar_aggregate_node.cc +++ b/cpp/src/arrow/acero/scalar_aggregate_node.cc @@ -25,6 +25,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/util.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index 5b5dfdf69eb94..b701d9928691f 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -20,18 +20,23 @@ #pragma once +/// \defgroup compute-functions Abstract compute function API +/// @{ +/// @} + /// \defgroup compute-concrete-options Concrete option classes for compute functions /// @{ /// @} -#include "arrow/compute/api_aggregate.h" // IWYU pragma: export -#include "arrow/compute/api_scalar.h" // IWYU pragma: export -#include "arrow/compute/api_vector.h" // IWYU pragma: export -#include "arrow/compute/cast.h" // IWYU pragma: export -#include "arrow/compute/function.h" // IWYU pragma: export -#include "arrow/compute/kernel.h" // IWYU pragma: export -#include "arrow/compute/registry.h" // IWYU pragma: export -#include "arrow/datum.h" // IWYU pragma: export +#include "arrow/compute/api_aggregate.h" // IWYU pragma: export +#include "arrow/compute/api_scalar.h" // IWYU pragma: export +#include "arrow/compute/api_vector.h" // IWYU pragma: export +#include "arrow/compute/cast.h" // IWYU pragma: export +#include "arrow/compute/function.h" // IWYU pragma: export +#include "arrow/compute/function_options.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export +#include "arrow/compute/registry.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export #include "arrow/compute/expression.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 3493c3146310d..4d2c814a69bbb 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -22,7 +22,7 @@ #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 9f12471ddca14..26fbe64f74293 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -24,7 +24,7 @@ #include #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/type_fwd.h" #include "arrow/datum.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 0233090ef6fb9..759f9e5c1a408 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ 
-20,9 +20,8 @@ #include #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/ordering.h" -#include "arrow/datum.h" #include "arrow/result.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 613e8a55addd2..18e56092dda2a 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -22,6 +22,7 @@ #include #include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index c0433145dd1d0..e1a2e8c5d8879 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -26,6 +26,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function_internal.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 333c9a65c56c4..be934a3c5abfc 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -36,53 +36,9 @@ namespace arrow { namespace compute { -/// \defgroup compute-functions Abstract compute function API -/// +/// \addtogroup compute-functions /// @{ -/// \brief Extension point for defining options outside libarrow (but -/// still within this project). -class ARROW_EXPORT FunctionOptionsType { - public: - virtual ~FunctionOptionsType() = default; - - virtual const char* type_name() const = 0; - virtual std::string Stringify(const FunctionOptions&) const = 0; - virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; - virtual Result> Serialize(const FunctionOptions&) const; - virtual Result> Deserialize( - const Buffer& buffer) const; - virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; -}; - -/// \brief Base class for specifying options configuring a function's behavior, -/// such as error handling. -class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { - public: - virtual ~FunctionOptions() = default; - - const FunctionOptionsType* options_type() const { return options_type_; } - const char* type_name() const { return options_type()->type_name(); } - - bool Equals(const FunctionOptions& other) const; - std::string ToString() const; - std::unique_ptr Copy() const; - /// \brief Serialize an options struct to a buffer. - Result> Serialize() const; - /// \brief Deserialize an options struct from a buffer. - /// Note: this will only look for `type_name` in the default FunctionRegistry; - /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then - /// call FunctionOptionsType::Deserialize(). - static Result> Deserialize( - const std::string& type_name, const Buffer& buffer); - - protected: - explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} - const FunctionOptionsType* options_type_; -}; - -ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); - /// \brief Contains the number of required arguments for the function. /// /// Naming conventions taken from https://en.wikipedia.org/wiki/Arity. 
diff --git a/cpp/src/arrow/compute/function_options.h b/cpp/src/arrow/compute/function_options.h new file mode 100644 index 0000000000000..88ec2fd2d0679 --- /dev/null +++ b/cpp/src/arrow/compute/function_options.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle. + +#pragma once + +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \addtogroup compute-functions +/// @{ + +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + virtual Result> Serialize(const FunctionOptions&) const; + virtual Result> Deserialize( + const Buffer& buffer) const; + virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; +}; + +/// \brief Base class for specifying options configuring a function's behavior, +/// such as error handling. +class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { + public: + virtual ~FunctionOptions() = default; + + const FunctionOptionsType* options_type() const { return options_type_; } + const char* type_name() const { return options_type()->type_name(); } + + bool Equals(const FunctionOptions& other) const; + std::string ToString() const; + std::unique_ptr Copy() const; + /// \brief Serialize an options struct to a buffer. + Result> Serialize() const; + /// \brief Deserialize an options struct from a buffer. + /// Note: this will only look for `type_name` in the default FunctionRegistry; + /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then + /// call FunctionOptionsType::Deserialize(). 
+ static Result> Deserialize( + const std::string& type_name, const Buffer& buffer); + + protected: + explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} + const FunctionOptionsType* options_type_; +}; + +ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index b72402bbccd4e..58bc560f52842 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -21,6 +21,7 @@ #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/compute/api_scalar.h" +#include "arrow/compute/function.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/key_value_metadata.h" diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 780ae25d96360..0cea7246e516c 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc index 719969d46ea7c..971a841de0773 100644 --- a/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc @@ -18,6 +18,7 @@ #include #include "arrow/array.h" +#include "arrow/datum.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc index 0bd8e3386e7cc..f02aee1b35996 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc @@ -21,6 +21,7 @@ #include "arrow/array/validate.h" #include "arrow/builder.h" #include "arrow/compute/api_vector.h" +#include "arrow/datum.h" #include "arrow/testing/gtest_util.h" #include "arrow/type_fwd.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index 5000de8996280..1740a9b7f0bb4 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 8ddcbb9905cb2..e08a2bc10372f 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/registry_test.cc b/cpp/src/arrow/compute/registry_test.cc index 7fee136de7a0b..2d69f119df1f4 100644 --- a/cpp/src/arrow/compute/registry_test.cc +++ b/cpp/src/arrow/compute/registry_test.cc @@ -22,6 +22,7 @@ #include #include "arrow/compute/function.h" +#include 
"arrow/compute/function_options.h" #include "arrow/compute/registry.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/type_fwd.h b/cpp/src/arrow/compute/type_fwd.h index 3f990b1814311..89f32ceb0f906 100644 --- a/cpp/src/arrow/compute/type_fwd.h +++ b/cpp/src/arrow/compute/type_fwd.h @@ -27,6 +27,7 @@ struct TypeHolder; namespace compute { class Function; +class ScalarAggregateFunction; class FunctionExecutor; class FunctionOptions; class FunctionRegistry; From ae627c09b08dbd9b4faac545170f4706645ca4ce Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 27 Dec 2023 15:06:23 +0100 Subject: [PATCH 107/570] GH-39251: [JS] Use resizable buffer in builder (#39252) --- js/src/builder.ts | 2 +- js/src/builder/binary.ts | 4 ++-- js/src/builder/buffer.ts | 44 +++++++++++++++++++++++++------------ js/src/builder/largeutf8.ts | 2 +- js/src/builder/union.ts | 4 ++-- js/src/builder/utf8.ts | 2 +- 6 files changed, 37 insertions(+), 21 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index a4e2d4d89325c..1880db3818ca5 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -342,7 +342,7 @@ export abstract class Builder { export abstract class FixedWidthBuilder extends Builder { constructor(opts: BuilderOptions) { super(opts); - this._values = new DataBufferBuilder(new this.ArrayType(0), this.stride); + this._values = new DataBufferBuilder(this.ArrayType, 0, this.stride); } public setValue(index: number, value: T['TValue']) { const values = this._values; diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index 3c12ddf34abb0..fa9a11b24ec39 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -16,15 +16,15 @@ // under the License. import { Binary } from '../type.js'; -import { toUint8Array } from '../util/buffer.js'; import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; +import { toUint8Array } from '../util/buffer.js'; /** @ignore */ export class BinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 402172059682c..18c6dcda738b9 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -24,20 +24,36 @@ function roundLengthUpToNearest64Bytes(len: number, BPE: number) { const bytesMinus1 = Math.ceil(len) * BPE - 1; return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; } + /** @ignore */ -const sliceOrExtendArray = (arr: T, len = 0) => ( - arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) -) as T; +function resizeArray(arr: T, len = 0): T { + // TODO: remove when https://github.com/microsoft/TypeScript/issues/54636 is fixed + const buffer = arr.buffer as ArrayBufferLike & { resizable: boolean; resize: (byteLength: number) => void; maxByteLength: number }; + const byteLength = len * arr.BYTES_PER_ELEMENT; + if (buffer.resizable && byteLength <= buffer.maxByteLength) { + buffer.resize(byteLength); + return arr; + } + + // Fallback for non-resizable buffers + return arr.length >= len ? 
+ arr.subarray(0, len) as T : + memcpy(new (arr.constructor as any)(len), arr, 0); +} + +/** @ignore */ +export const SAFE_ARRAY_SIZE = 2 ** 32 - 1; /** @ignore */ export class BufferBuilder { - constructor(buffer: T, stride = 1) { - this.buffer = buffer; + constructor(bufferType: ArrayCtor, initialSize = 0, stride = 1) { + this.length = Math.ceil(initialSize / stride); + // TODO: remove as any when https://github.com/microsoft/TypeScript/issues/54636 is fixed + this.buffer = new bufferType(new (ArrayBuffer as any)(this.length * bufferType.BYTES_PER_ELEMENT, { maxByteLength: SAFE_ARRAY_SIZE })) as T; this.stride = stride; - this.BYTES_PER_ELEMENT = buffer.BYTES_PER_ELEMENT; - this.ArrayType = buffer.constructor as ArrayCtor; - this._resize(this.length = Math.ceil(buffer.length / stride)); + this.BYTES_PER_ELEMENT = bufferType.BYTES_PER_ELEMENT; + this.ArrayType = bufferType; } public buffer: T; @@ -72,17 +88,18 @@ export class BufferBuilder { } public flush(length = this.length) { length = roundLengthUpToNearest64Bytes(length * this.stride, this.BYTES_PER_ELEMENT); - const array = sliceOrExtendArray(this.buffer, length); + const array = resizeArray(this.buffer, length); this.clear(); return array; } public clear() { this.length = 0; - this._resize(0); + // TODO: remove as any when https://github.com/microsoft/TypeScript/issues/54636 is fixed + this.buffer = new this.ArrayType(new (ArrayBuffer as any)(0, { maxByteLength: SAFE_ARRAY_SIZE })) as T; return this; } protected _resize(newLength: number) { - return this.buffer = memcpy(new this.ArrayType(newLength), this.buffer); + return this.buffer = resizeArray(this.buffer, newLength); } } @@ -100,7 +117,7 @@ export class DataBufferBuilder extends Buffe /** @ignore */ export class BitmapBufferBuilder extends DataBufferBuilder { - constructor(data = new Uint8Array(0)) { super(data, 1 / 8); } + constructor() { super(Uint8Array, 0, 1 / 8); } public numValid = 0; public get numInvalid() { return this.length - this.numValid; } @@ -123,9 +140,8 @@ export class BitmapBufferBuilder extends DataBufferBuilder { /** @ignore */ export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { - super(new type.OffsetArrayType(1), 1); + super(type.OffsetArrayType as ArrayCtor, 1, 1); } - public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 51890100095c1..90a0bde9f3443 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -25,7 +25,7 @@ import { LargeBinaryBuilder } from './largebinary.js'; export class LargeUtf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/union.ts b/js/src/builder/union.ts index ac8a13191a549..7bee460a77de1 100644 --- a/js/src/builder/union.ts +++ b/js/src/builder/union.ts @@ -31,7 +31,7 @@ export abstract class UnionBuilder extends Builder constructor(options: UnionBuilderOptions) { super(options); - this._typeIds = new DataBufferBuilder(new Int8Array(0), 1); + this._typeIds = new DataBufferBuilder(Int8Array, 0, 1); if (typeof options['valueToChildTypeId'] === 'function') { this._valueToChildTypeId = options['valueToChildTypeId']; } @@ -84,7 +84,7 @@ export class DenseUnionBuilder extends UnionB constructor(options: 
UnionBuilderOptions) { super(options); - this._offsets = new DataBufferBuilder(new Int32Array(0)); + this._offsets = new DataBufferBuilder(Int32Array); } /** @ignore */ diff --git a/js/src/builder/utf8.ts b/js/src/builder/utf8.ts index 53b8306cbaffd..aac0aec54fe90 100644 --- a/js/src/builder/utf8.ts +++ b/js/src/builder/utf8.ts @@ -25,7 +25,7 @@ import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; export class Utf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); From 9e33d12f1b022c902cc831026ceb3e0016ca4b3c Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Wed, 27 Dec 2023 10:10:46 -0800 Subject: [PATCH 108/570] GH-39341: [C#] Support Utf8View, BinaryView and ListView (#39342) ### What changes are included in this PR? Support for reading, writing and representing Utf8View, BinaryView and ListView. ### Are these changes tested? Yes ### Are there any user-facing changes? New classes and APIs for Utf8View, BinaryView and ListView. * Closes: #39341 Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../Arrays/ArrayDataConcatenator.cs | 102 +++++- .../Arrays/ArrowArrayBuilderFactory.cs | 6 + .../Apache.Arrow/Arrays/ArrowArrayFactory.cs | 6 + .../Apache.Arrow/Arrays/BinaryViewArray.cs | 344 ++++++++++++++++++ .../src/Apache.Arrow/Arrays/ListViewArray.cs | 217 +++++++++++ .../Apache.Arrow/Arrays/StringViewArray.cs | 110 ++++++ .../src/Apache.Arrow/C/CArrowArrayExporter.cs | 22 +- .../src/Apache.Arrow/C/CArrowArrayImporter.cs | 48 +++ .../Apache.Arrow/C/CArrowSchemaExporter.cs | 3 + .../Apache.Arrow/C/CArrowSchemaImporter.cs | 6 +- .../Extensions/ArrayDataExtensions.cs | 11 + .../Extensions/FlatbufExtensions.cs | 19 - csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs | 47 +++ .../Flatbuf/Enums/MetadataVersion.cs | 12 +- csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs | 16 + csharp/src/Apache.Arrow/Flatbuf/Field.cs | 4 + .../src/Apache.Arrow/Flatbuf/LargeListView.cs | 42 +++ csharp/src/Apache.Arrow/Flatbuf/ListView.cs | 43 +++ .../src/Apache.Arrow/Flatbuf/RecordBatch.cs | 37 +- .../src/Apache.Arrow/Flatbuf/SparseTensor.cs | 4 + csharp/src/Apache.Arrow/Flatbuf/Tensor.cs | 4 + csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs | 47 +++ .../Ipc/ArrowReaderImplementation.cs | 79 ++-- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 57 ++- .../Ipc/ArrowTypeFlatbufferBuilder.cs | 39 ++ .../src/Apache.Arrow/Ipc/MessageSerializer.cs | 10 + csharp/src/Apache.Arrow/Scalars/BinaryView.cs | 111 ++++++ .../src/Apache.Arrow/Types/BinaryViewType.cs | 28 ++ csharp/src/Apache.Arrow/Types/IArrowType.cs | 3 + csharp/src/Apache.Arrow/Types/ListViewType.cs | 35 ++ .../src/Apache.Arrow/Types/StringViewType.cs | 28 ++ .../ArrowWriterBenchmark.cs | 2 +- .../Apache.Arrow.IntegrationTest/JsonFile.cs | 156 +++++++- .../Properties/launchSettings.json | 8 + .../ArrowArrayConcatenatorTests.cs | 89 +++++ .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 61 ++++ .../Apache.Arrow.Tests/BinaryViewTests.cs | 89 +++++ .../CDataInterfacePythonTests.cs | 4 +- csharp/test/Apache.Arrow.Tests/TableTests.cs | 6 +- csharp/test/Apache.Arrow.Tests/TestData.cs | 198 ++++++++-- dev/archery/archery/integration/datagen.py | 3 +- docs/source/status.rst | 10 +- 42 files changed, 2017 insertions(+), 149 deletions(-) create mode 100644 csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs create mode 100644 
csharp/src/Apache.Arrow/Arrays/ListViewArray.cs create mode 100644 csharp/src/Apache.Arrow/Arrays/StringViewArray.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/ListView.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs create mode 100644 csharp/src/Apache.Arrow/Scalars/BinaryView.cs create mode 100644 csharp/src/Apache.Arrow/Types/BinaryViewType.cs create mode 100644 csharp/src/Apache.Arrow/Types/ListViewType.cs create mode 100644 csharp/src/Apache.Arrow/Types/StringViewType.cs create mode 100644 csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json create mode 100644 csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 463ca49e29c94..698d74e4bac84 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -14,6 +14,7 @@ // limitations under the License. using Apache.Arrow.Memory; +using Apache.Arrow.Scalars; using Apache.Arrow.Types; using System; using System.Collections.Generic; @@ -46,8 +47,11 @@ private class ArrayDataConcatenationVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -84,17 +88,50 @@ public void Visit(FixedWidthType type) { CheckData(type, 2); ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); - ArrowBuffer valueBuffer = ConcatenateFixedWidthTypeValueBuffer(type); + ArrowBuffer valueBuffer = ConcatenateFixedWidthTypeValueBuffer(1, type); Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, valueBuffer }); } public void Visit(BinaryType type) => ConcatenateVariableBinaryArrayData(type); + public void Visit(BinaryViewType type) => ConcatenateBinaryViewArrayData(type); + public void Visit(StringType type) => ConcatenateVariableBinaryArrayData(type); + public void Visit(StringViewType type) => ConcatenateBinaryViewArrayData(type); + public void Visit(ListType type) => ConcatenateLists(type); + public void Visit(ListViewType type) + { + CheckData(type, 3); + ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + + var offsetsBuilder = new ArrowBuffer.Builder(_totalLength); + int baseOffset = 0; + + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length > 0) + { + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + foreach (int offset in span) + { + offsetsBuilder.Append(baseOffset + offset); + } + } + + baseOffset += arrayData.Children[0].Length; + } + + ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator); + ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default); + ArrayData child = Concatenate(SelectChildren(0), _allocator); + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { child }); + } + public void Visit(FixedSizeListType type) { CheckData(type, 1); @@ -161,6 +198,15 @@ private void CheckData(IArrowType type, int expectedBufferCount) } } + private void CheckDataVariadicCount(IArrowType type, int expectedBufferCount) + { + foreach (ArrayData arrayData in _arrayDataList) + { + 
arrayData.EnsureDataType(type.TypeId); + arrayData.EnsureVariadicBufferCount(expectedBufferCount); + } + } + private void ConcatenateVariableBinaryArrayData(IArrowType type) { CheckData(type, 3); @@ -171,6 +217,26 @@ private void ConcatenateVariableBinaryArrayData(IArrowType type) Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, valueBuffer }); } + private void ConcatenateBinaryViewArrayData(IArrowType type) + { + CheckDataVariadicCount(type, 2); + ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + ArrowBuffer viewBuffer = ConcatenateViewBuffer(out int variadicBufferCount); + ArrowBuffer[] buffers = new ArrowBuffer[2 + variadicBufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewBuffer; + int index = 2; + foreach (ArrayData arrayData in _arrayDataList) + { + for (int i = 2; i < arrayData.Buffers.Length; i++) + { + buffers[index++] = arrayData.Buffers[i]; + } + } + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, buffers); + } + private void ConcatenateLists(NestedType type) { CheckData(type, 2); @@ -206,7 +272,7 @@ private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex) return builder.Build(_allocator); } - private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(FixedWidthType type) + private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(int bufferIndex, FixedWidthType type) { int typeByteWidth = type.BitWidth / 8; var builder = new ArrowBuffer.Builder(_totalLength * typeByteWidth); @@ -216,7 +282,7 @@ private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(FixedWidthType type) int length = arrayData.Length; int byteLength = length * typeByteWidth; - builder.Append(arrayData.Buffers[1].Span.Slice(0, byteLength)); + builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(0, byteLength)); } return builder.Build(_allocator); @@ -265,6 +331,36 @@ private ArrowBuffer ConcatenateOffsetBuffer() return builder.Build(_allocator); } + private ArrowBuffer ConcatenateViewBuffer(out int variadicBufferCount) + { + var builder = new ArrowBuffer.Builder(_totalLength); + variadicBufferCount = 0; + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length == 0) + { + continue; + } + + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + foreach (BinaryView view in span) + { + if (view.Length > BinaryView.MaxInlineLength) + { + builder.Append(view.AdjustBufferIndex(variadicBufferCount)); + } + else + { + builder.Append(view); + } + } + + variadicBufferCount += (arrayData.Buffers.Length - 2); + } + + return builder.Build(_allocator); + } + private ArrowBuffer ConcatenateUnionTypeBuffer() { var builder = new ArrowBuffer.Builder(_totalLength); diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs index af5a524798396..f8367102082f5 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs @@ -54,8 +54,12 @@ internal static IArrowArrayBuilder> return new DoubleArray.Builder(); case ArrowTypeId.String: return new StringArray.Builder(); + case ArrowTypeId.StringView: + return new StringViewArray.Builder(); case ArrowTypeId.Binary: return new BinaryArray.Builder(); + case ArrowTypeId.BinaryView: + return new BinaryViewArray.Builder(); case ArrowTypeId.Timestamp: return new TimestampArray.Builder(); case ArrowTypeId.Date64: @@ -70,6 +74,8 @@ internal static IArrowArrayBuilder> return new 
DurationArray.Builder(dataType as DurationType); case ArrowTypeId.List: return new ListArray.Builder(dataType as ListType); + case ArrowTypeId.ListView: + return new ListViewArray.Builder(dataType as ListViewType); case ArrowTypeId.FixedSizeList: return new FixedSizeListArray.Builder(dataType as FixedSizeListType); case ArrowTypeId.Decimal128: diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs index d6577260bb82d..3d2ab1d2129f1 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs @@ -51,14 +51,20 @@ public static IArrowArray BuildArray(ArrayData data) return new DoubleArray(data); case ArrowTypeId.String: return new StringArray(data); + case ArrowTypeId.StringView: + return new StringViewArray(data); case ArrowTypeId.FixedSizedBinary: return new FixedSizeBinaryArray(data); case ArrowTypeId.Binary: return new BinaryArray(data); + case ArrowTypeId.BinaryView: + return new BinaryViewArray(data); case ArrowTypeId.Timestamp: return new TimestampArray(data); case ArrowTypeId.List: return new ListArray(data); + case ArrowTypeId.ListView: + return new ListViewArray(data); case ArrowTypeId.Map: return new MapArray(data); case ArrowTypeId.Struct: diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs new file mode 100644 index 0000000000000..4f62dffd1ddeb --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using Apache.Arrow.Memory; +using Apache.Arrow.Scalars; +using Apache.Arrow.Types; +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Collections; + +namespace Apache.Arrow +{ + public class BinaryViewArray : Array, IReadOnlyList + { + public class Builder : BuilderBase + { + public Builder() : base(BinaryViewType.Default) { } + public Builder(IArrowType dataType) : base(dataType) { } + + protected override BinaryViewArray Build(ArrayData data) + { + return new BinaryViewArray(data); + } + } + + public BinaryViewArray(ArrayData data) + : base(data) + { + data.EnsureDataType(ArrowTypeId.BinaryView); + data.EnsureVariadicBufferCount(2); + } + + public BinaryViewArray(ArrowTypeId typeId, ArrayData data) + : base(data) + { + data.EnsureDataType(typeId); + data.EnsureVariadicBufferCount(2); + } + + public abstract class BuilderBase : IArrowArrayBuilder + where TArray : IArrowArray + where TBuilder : class, IArrowArrayBuilder + { + protected IArrowType DataType { get; } + protected TBuilder Instance => this as TBuilder; + protected ArrowBuffer.Builder BinaryViews { get; } + protected ArrowBuffer.Builder ValueBuffer { get; } + protected ArrowBuffer.BitmapBuilder ValidityBuffer { get; } + protected int NullCount => this.ValidityBuffer.UnsetBitCount; + + protected BuilderBase(IArrowType dataType) + { + DataType = dataType; + BinaryViews = new ArrowBuffer.Builder(); + ValueBuffer = new ArrowBuffer.Builder(); + ValidityBuffer = new ArrowBuffer.BitmapBuilder(); + } + + protected abstract TArray Build(ArrayData data); + + /// + /// Gets the length of the array built so far. + /// + public int Length => BinaryViews.Length; + + /// + /// Build an Arrow array from the appended contents so far. + /// + /// Optional memory allocator. + /// Returns an array of type . + public TArray Build(MemoryAllocator allocator = default) + { + bool hasValues = ValueBuffer.Length > 0; + var bufs = new ArrowBuffer[hasValues ? 3 : 2]; + bufs[0] = NullCount > 0 ? ValidityBuffer.Build(allocator) : ArrowBuffer.Empty; + bufs[1] = BinaryViews.Build(allocator); + if (hasValues) { bufs[2] = ValueBuffer.Build(allocator); } + + var data = new ArrayData( + DataType, + length: Length, + NullCount, + offset: 0, + bufs); + + return Build(data); + } + + /// + /// Append a single null value to the array. + /// + /// Returns the builder (for fluent-style composition). + public TBuilder AppendNull() + { + // Do not add to the value buffer in the case of a null. + // Note that we do not need to increment the offset as a result. + ValidityBuffer.Append(false); + BinaryViews.Append(default(BinaryView)); + return Instance; + } + + /// + /// Appends a value, consisting of a single byte, to the array. + /// + /// Byte value to append. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(byte value) + { + ValidityBuffer.Append(true); + Span buf = stackalloc[] { value }; + BinaryViews.Append(new BinaryView(buf)); + return Instance; + } + + /// + /// Append a value, consisting of a span of bytes, to the array. + /// + /// + /// Note that a single value is added, which consists of arbitrarily many bytes. If multiple values are + /// to be added, use the method. + /// + /// Span of bytes to add. + /// Returns the builder (for fluent-style composition). 
+ public TBuilder Append(ReadOnlySpan span) + { + if (span.Length > BinaryView.MaxInlineLength) + { + int offset = ValueBuffer.Length; + ValueBuffer.Append(span); + BinaryViews.Append(new BinaryView(span.Length, span.Slice(0, 4), 0, offset)); + } + else + { + BinaryViews.Append(new BinaryView(span)); + } + ValidityBuffer.Append(true); + return Instance; + } + + /// + /// Append an enumerable collection of single-byte values to the array. + /// + /// + /// Note that this method appends multiple values, each of which is a single byte + /// + /// Single-byte values to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + if (values == null) + { + throw new ArgumentNullException(nameof(values)); + } + + foreach (byte b in values) + { + Append(b); + } + + return Instance; + } + + /// + /// Append an enumerable collection of values to the array. + /// + /// Values to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + if (values == null) + { + throw new ArgumentNullException(nameof(values)); + } + + foreach (byte[] arr in values) + { + if (arr == null) + { + AppendNull(); + } + else + { + Append((ReadOnlySpan)arr); + } + } + + return Instance; + } + + public TBuilder Reserve(int capacity) + { + // TODO: [ARROW-9366] Reserve capacity in the value buffer in a more sensible way. + BinaryViews.Reserve(capacity); + ValueBuffer.Reserve(capacity); + ValidityBuffer.Reserve(capacity); + return Instance; + } + + public TBuilder Resize(int length) + { + // TODO: [ARROW-9366] Resize the value buffer to a safe length based on offsets, not `length`. + BinaryViews.Resize(length); + ValueBuffer.Resize(length); + ValidityBuffer.Resize(length); + return Instance; + } + + public TBuilder Swap(int i, int j) + { + ValidityBuffer.Swap(i, j); + BinaryView view = BinaryViews.Span[i]; + BinaryViews.Span[i] = BinaryViews.Span[j]; + BinaryViews.Span[j] = view; + return Instance; + } + + public TBuilder Set(int index, byte value) + { + // TODO: Implement + throw new NotImplementedException(); + } + + /// + /// Clear all contents appended so far. + /// + /// Returns the builder (for fluent-style composition). + public TBuilder Clear() + { + BinaryViews.Clear(); + ValueBuffer.Clear(); + ValidityBuffer.Clear(); + return Instance; + } + } + + public BinaryViewArray(IArrowType dataType, int length, + ArrowBuffer binaryViewsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, binaryViewsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public ArrowBuffer ViewsBuffer => Data.Buffers[1]; + + public int DataBufferCount => Data.Buffers.Length - 2; + + public ArrowBuffer DataBuffer(int index) => Data.Buffers[index + 2]; + + public ReadOnlySpan Views => ViewsBuffer.Span.CastTo().Slice(Offset, Length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + if (!IsValid(index)) + { + return 0; + } + + return Views[index].Length; + } + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. 
+ /// + /// + /// Note that this method cannot reliably identify null values, which are indistinguishable from empty byte + /// collection values when seen in the context of this method's return type of . + /// Use the method or the overload instead + /// to reliably determine null values. + /// + /// Index at which to get bytes. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index) => GetBytes(index, out _); + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. + /// + /// Index at which to get bytes. + /// Set to if the value at the given index is null. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index, out bool isNull) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + isNull = IsNull(index); + + if (isNull) + { + // Note that `return null;` is valid syntax, but would be misleading as `null` in the context of a span + // is actually returned as an empty span. + return ReadOnlySpan.Empty; + } + + BinaryView binaryView = Views[index]; + if (binaryView.IsInline) + { + return ViewsBuffer.Span.Slice(16 * index + 4, binaryView.Length); + } + + return DataBuffer(binaryView._bufferIndex).Span.Slice(binaryView._bufferOffset, binaryView.Length); + } + + int IReadOnlyCollection.Count => Length; + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs b/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs new file mode 100644 index 0000000000000..081385d9211a4 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + public class ListViewArray : Array + { + public class Builder : IArrowArrayBuilder + { + public IArrowArrayBuilder> ValueBuilder { get; } + + public int Length => ValueOffsetsBufferBuilder.Length; + + private ArrowBuffer.Builder ValueOffsetsBufferBuilder { get; } + + private ArrowBuffer.Builder SizesBufferBuilder { get; } + + private ArrowBuffer.BitmapBuilder ValidityBufferBuilder { get; } + + public int NullCount { get; protected set; } + + private IArrowType DataType { get; } + + private int Start { get; set; } + + public Builder(IArrowType valueDataType) : this(new ListViewType(valueDataType)) + { + } + + public Builder(Field valueField) : this(new ListViewType(valueField)) + { + } + + internal Builder(ListViewType dataType) + { + ValueBuilder = ArrowArrayBuilderFactory.Build(dataType.ValueDataType); + ValueOffsetsBufferBuilder = new ArrowBuffer.Builder(); + SizesBufferBuilder = new ArrowBuffer.Builder(); + ValidityBufferBuilder = new ArrowBuffer.BitmapBuilder(); + DataType = dataType; + Start = -1; + } + + /// + /// Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder. TODO: Consider adding builder APIs to support construction + /// of overlapping lists. + /// + public Builder Append() + { + AppendPrevious(); + + ValidityBufferBuilder.Append(true); + + return this; + } + + public Builder AppendNull() + { + AppendPrevious(); + + ValidityBufferBuilder.Append(false); + ValueOffsetsBufferBuilder.Append(Start); + SizesBufferBuilder.Append(0); + NullCount++; + Start = -1; + + return this; + } + + private void AppendPrevious() + { + if (Start >= 0) + { + ValueOffsetsBufferBuilder.Append(Start); + SizesBufferBuilder.Append(ValueBuilder.Length - Start); + } + Start = ValueBuilder.Length; + } + + public ListViewArray Build(MemoryAllocator allocator = default) + { + AppendPrevious(); + + ArrowBuffer validityBuffer = NullCount > 0 + ? 
ValidityBufferBuilder.Build(allocator) + : ArrowBuffer.Empty; + + return new ListViewArray(DataType, Length, + ValueOffsetsBufferBuilder.Build(allocator), SizesBufferBuilder.Build(allocator), + ValueBuilder.Build(allocator), + validityBuffer, NullCount, 0); + } + + public Builder Reserve(int capacity) + { + ValueOffsetsBufferBuilder.Reserve(capacity); + SizesBufferBuilder.Reserve(capacity); + ValidityBufferBuilder.Reserve(capacity); + return this; + } + + public Builder Resize(int length) + { + ValueOffsetsBufferBuilder.Resize(length); + SizesBufferBuilder.Resize(length); + ValidityBufferBuilder.Resize(length); + return this; + } + + public Builder Clear() + { + ValueOffsetsBufferBuilder.Clear(); + SizesBufferBuilder.Clear(); + ValueBuilder.Clear(); + ValidityBufferBuilder.Clear(); + return this; + } + + } + + public IArrowArray Values { get; } + + public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1]; + + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(Offset, Length); + + public ArrowBuffer SizesBuffer => Data.Buffers[2]; + + public ReadOnlySpan Sizes => SizesBuffer.Span.CastTo().Slice(Offset, Length); + + public ListViewArray(IArrowType dataType, int length, + ArrowBuffer valueOffsetsBuffer, ArrowBuffer sizesBuffer, IArrowArray values, + ArrowBuffer nullBitmapBuffer, int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, sizesBuffer }, new[] { values.Data }), + values) + { + } + + public ListViewArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0])) + { + } + + private ListViewArray(ArrayData data, IArrowArray values) : base(data) + { + data.EnsureBufferCount(3); + data.EnsureDataType(ArrowTypeId.ListView); + Values = values; + } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return 0; + } + + return Sizes[index]; + } + + public IArrowArray GetSlicedValues(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return null; + } + + if (!(Values is Array array)) + { + return default; + } + + return array.Slice(ValueOffsets[index], GetValueLength(index)); + } + + protected override void Dispose(bool disposing) + { + if (disposing) + { + Values?.Dispose(); + } + base.Dispose(disposing); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs b/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs new file mode 100644 index 0000000000000..88644761535d9 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Types; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; + +namespace Apache.Arrow +{ + public class StringViewArray: BinaryViewArray, IReadOnlyList + { + public static readonly Encoding DefaultEncoding = Encoding.UTF8; + + public new class Builder : BuilderBase + { + public Builder() : base(StringViewType.Default) { } + + protected override StringViewArray Build(ArrayData data) + { + return new StringViewArray(data); + } + + public Builder Append(string value, Encoding encoding = null) + { + if (value == null) + { + return AppendNull(); + } + encoding = encoding ?? DefaultEncoding; + byte[] span = encoding.GetBytes(value); + return Append(span.AsSpan()); + } + + public Builder AppendRange(IEnumerable values, Encoding encoding = null) + { + foreach (string value in values) + { + Append(value, encoding); + } + + return this; + } + } + + public StringViewArray(ArrayData data) + : base(ArrowTypeId.StringView, data) { } + + public StringViewArray(int length, + ArrowBuffer valueOffsetsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(StringViewType.Default, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public string GetString(int index, Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + ReadOnlySpan bytes = GetBytes(index, out bool isNull); + + if (isNull) + { + return null; + } + if (bytes.Length == 0) + { + return string.Empty; + } + + unsafe + { + fixed (byte* data = &MemoryMarshal.GetReference(bytes)) + return encoding.GetString(data, bytes.Length); + } + } + + int IReadOnlyCollection.Count => Length; + + string IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + }; + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + } +} diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 2d9febea33f54..03059eaf5d4df 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -15,10 +15,12 @@ using System; +using System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using Apache.Arrow.Memory; +using Apache.Arrow.Types; namespace Apache.Arrow.C { @@ -121,7 +123,16 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr cArray->buffers = null; if (cArray->n_buffers > 0) { - cArray->buffers = (byte**)sharedOwner.Allocate(array.Buffers.Length * IntPtr.Size); + long* lengths = null; + int bufferCount = array.Buffers.Length; + if (array.DataType.TypeId == ArrowTypeId.BinaryView || array.DataType.TypeId == ArrowTypeId.StringView) + { + lengths = (long*)sharedOwner.Allocate(8 * bufferCount); // overallocation to avoid edge case + bufferCount++; + cArray->n_buffers++; + } + + cArray->buffers = (byte**)sharedOwner.Allocate(bufferCount * IntPtr.Size); for (int i = 0; i < array.Buffers.Length; i++) { ArrowBuffer buffer = array.Buffers[i]; @@ -131,6 +142,15 @@ private unsafe static void 
ConvertArray(ExportedAllocationOwner sharedOwner, Arr throw new NotSupportedException($"An ArrowArray of type {array.DataType.TypeId} could not be exported: failed on buffer #{i}"); } cArray->buffers[i] = (byte*)ptr; + if (lengths != null && i >= 2) + { + lengths[i - 2] = array.Buffers[i].Length; + } + } + + if (lengths != null) + { + cArray->buffers[array.Buffers.Length] = (byte*)lengths; } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index 1b40ec49658bb..fbb2be661fc5d 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -157,10 +157,18 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) case ArrowTypeId.Binary: buffers = ImportByteArrayBuffers(cArray); break; + case ArrowTypeId.StringView: + case ArrowTypeId.BinaryView: + buffers = ImportByteArrayViewBuffers(cArray); + break; case ArrowTypeId.List: children = ProcessListChildren(cArray, ((ListType)type).ValueDataType); buffers = ImportListBuffers(cArray); break; + case ArrowTypeId.ListView: + children = ProcessListChildren(cArray, ((ListViewType)type).ValueDataType); + buffers = ImportListViewBuffers(cArray); + break; case ArrowTypeId.FixedSizeList: children = ProcessListChildren(cArray, ((FixedSizeListType)type).ValueDataType); buffers = ImportFixedSizeListBuffers(cArray); @@ -268,6 +276,28 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers < 3) + { + throw new InvalidOperationException("Byte array views are expected to have at least three buffers"); + } + + int length = checked((int)cArray->length); + int viewsLength = length * 16; + + long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; + ArrowBuffer[] buffers = new ArrowBuffer[cArray->n_buffers - 1]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, viewsLength)); + for (int i = 2; i < buffers.Length; i++) + { + buffers[i] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, checked((int)bufferLengths[i - 2]))); + } + + return buffers; + } + private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 2) @@ -285,6 +315,24 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 3) + { + throw new InvalidOperationException("List view arrays are expected to have exactly three buffers"); + } + + int length = checked((int)cArray->length); + int offsetsLength = length * 4; + + ArrowBuffer[] buffers = new ArrowBuffer[3]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, offsetsLength)); + + return buffers; + } + private ArrowBuffer[] ImportFixedSizeListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 1) diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index c9b45a8eb2d87..3bb7134af3ba9 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -167,7 +167,9 @@ private static string GetFormat(IArrowType datatype) return $"d:{decimalType.Precision},{decimalType.Scale},256"; // 
Binary case BinaryType _: return "z"; + case BinaryViewType _: return "vz"; case StringType _: return "u"; + case StringViewType _: return "vu"; case FixedSizeBinaryType binaryType: return $"w:{binaryType.ByteWidth}"; // Date @@ -196,6 +198,7 @@ private static string GetFormat(IArrowType datatype) }; // Nested case ListType _: return "+l"; + case ListViewType _: return "+vl"; case FixedSizeListType fixedListType: return $"+w:{fixedListType.ListSize}"; case StructType _: return "+s"; diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index 9c81195771bae..f1acc007bcef7 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -165,7 +165,7 @@ public ArrowType GetAsType() } // Special handling for nested types - if (format == "+l") + if (format == "+l" || format == "+vl") { if (_cSchema->n_children != 1) { @@ -180,7 +180,7 @@ public ArrowType GetAsType() Field childField = childSchema.GetAsField(); - return new ListType(childField); + return format[1] == 'v' ? new ListViewType(childField) : new ListType(childField); } else if (format == "+s") { @@ -303,8 +303,10 @@ public ArrowType GetAsType() "g" => DoubleType.Default, // Binary data "z" => BinaryType.Default, + "vz" => BinaryViewType.Default, //"Z" => new LargeBinaryType() // Not yet implemented "u" => StringType.Default, + "vu" => StringViewType.Default, //"U" => new LargeStringType(), // Not yet implemented // Date and time "tdD" => Date32Type.Default, diff --git a/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs b/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs index 399d9bf5e6bf1..2b6742a3d0cb2 100644 --- a/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs @@ -23,6 +23,17 @@ internal static class ArrayDataExtensions public static void EnsureBufferCount(this ArrayData data, int count) { if (data.Buffers.Length != count) + { + // TODO: Use localizable string resource + throw new ArgumentException( + $"Buffer count <{data.Buffers.Length}> must be at exactly <{count}>", + nameof(data.Buffers.Length)); + } + } + + public static void EnsureVariadicBufferCount(this ArrayData data, int count) + { + if (data.Buffers.Length < count) { // TODO: Use localizable string resource throw new ArgumentException( diff --git a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs index 5f39680b90ebc..b44c02d854077 100644 --- a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs @@ -19,25 +19,6 @@ namespace Apache.Arrow { internal static class FlatbufExtensions { - public static bool IsFixedPrimitive(this Flatbuf.Type t) - { - if (t == Flatbuf.Type.Utf8 || t == Flatbuf.Type.Binary) - return false; - return true; - } - - public static bool IsFixedPrimitive(this Types.IArrowType t) - { - return t.TypeId.IsFixedPrimitive(); - } - - public static bool IsFixedPrimitive(this Types.ArrowTypeId t) - { - if (t == Types.ArrowTypeId.String || t == Types.ArrowTypeId.Binary) - return false; - return true; - } - public static Types.IntervalUnit ToArrow(this Flatbuf.IntervalUnit unit) { switch (unit) diff --git a/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs b/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs new file mode 100644 index 0000000000000..2f9cca51737f8 --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs @@ -0,0 
+1,47 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Logically the same as Binary, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +internal struct BinaryView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static BinaryView GetRootAsBinaryView(ByteBuffer _bb) { return GetRootAsBinaryView(_bb, new BinaryView()); } + public static BinaryView GetRootAsBinaryView(ByteBuffer _bb, BinaryView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public BinaryView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartBinaryView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndBinaryView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class BinaryViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs b/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs index 1e893e8cb6ffc..13b5315805dc9 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs @@ -8,21 +8,21 @@ namespace Apache.Arrow.Flatbuf internal enum MetadataVersion : short { /// 0.1.0 (October 2016). - V1 = 0, + V1 = 0, /// 0.2.0 (February 2017). Non-backwards compatible with V1. - V2 = 1, + V2 = 1, /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. - V3 = 2, + V3 = 2, /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. - V4 = 3, - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + V4 = 3, + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// /// Incompatible changes between V4 and V5: /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. 
- V5 = 4, + V5 = 4, }; diff --git a/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs b/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs index 10f852efb9b96..9c04288648dea 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs @@ -33,6 +33,10 @@ internal enum Type : byte LargeUtf8 = 20, LargeList = 21, RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26, }; @@ -110,6 +114,18 @@ static public bool Verify(Google.FlatBuffers.Verifier verifier, byte typeId, uin case Type.RunEndEncoded: result = RunEndEncodedVerify.Verify(verifier, tablePos); break; + case Type.BinaryView: + result = BinaryViewVerify.Verify(verifier, tablePos); + break; + case Type.Utf8View: + result = Utf8ViewVerify.Verify(verifier, tablePos); + break; + case Type.ListView: + result = ListViewVerify.Verify(verifier, tablePos); + break; + case Type.LargeListView: + result = LargeListViewVerify.Verify(verifier, tablePos); + break; default: result = true; break; } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Field.cs b/csharp/src/Apache.Arrow/Flatbuf/Field.cs index c5c6c0a165598..efbc6afb06d03 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Field.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Field.cs @@ -57,6 +57,10 @@ internal struct Field : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// Present only if the field is dictionary encoded. public DictionaryEncoding? Dictionary { get { int o = __p.__offset(12); return o != 0 ? (DictionaryEncoding?)(new DictionaryEncoding()).__assign(__p.__indirect(o + __p.bb_pos), __p.bb) : null; } } /// children apply only to nested data types like Struct, List and Union. For diff --git a/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs b/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs new file mode 100644 index 0000000000000..685e91333c38c --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs @@ -0,0 +1,42 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent +/// extremely large data values. 
+internal struct LargeListView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static LargeListView GetRootAsLargeListView(ByteBuffer _bb) { return GetRootAsLargeListView(_bb, new LargeListView()); } + public static LargeListView GetRootAsLargeListView(ByteBuffer _bb, LargeListView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public LargeListView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartLargeListView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndLargeListView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class LargeListViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/ListView.cs b/csharp/src/Apache.Arrow/Flatbuf/ListView.cs new file mode 100644 index 0000000000000..d2e54e428524b --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/ListView.cs @@ -0,0 +1,43 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Represents the same logical types that List can, but contains offsets and +/// sizes allowing for writes in any order and sharing of child values among +/// list values. +internal struct ListView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static ListView GetRootAsListView(ByteBuffer _bb) { return GetRootAsListView(_bb, new ListView()); } + public static ListView GetRootAsListView(ByteBuffer _bb, ListView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public ListView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartListView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndListView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class ListViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs b/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs index 9ab9715165ddc..2df8716bc1655 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs @@ -38,27 +38,57 @@ internal struct RecordBatch : IFlatbufferObject public int BuffersLength { get { int o = __p.__offset(8); return o != 0 ? __p.__vector_len(o) : 0; } } /// Optional compression of the message body public BodyCompression? Compression { get { int o = __p.__offset(10); return o != 0 ? 
(BodyCompression?)(new BodyCompression()).__assign(__p.__indirect(o + __p.bb_pos), __p.bb) : null; } } + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicBufferCounts to indicate the number of number of variadic + /// buffers which belong to that Field in the current RecordBatch. + /// + /// For example, the schema + /// col1: Struct + /// col2: Utf8View + /// contains two Fields with variadic buffers so variadicBufferCounts will have + /// two entries, the first counting the variadic buffers of `col1.beta` and the + /// second counting `col2`'s. + /// + /// This field may be omitted if and only if the schema contains no Fields with + /// a variable number of buffers, such as BinaryView and Utf8View. + public long VariadicBufferCounts(int j) { int o = __p.__offset(12); return o != 0 ? __p.bb.GetLong(__p.__vector(o) + j * 8) : (long)0; } + public int VariadicBufferCountsLength { get { int o = __p.__offset(12); return o != 0 ? __p.__vector_len(o) : 0; } } +#if ENABLE_SPAN_T + public Span GetVariadicCountsBytes() { return __p.__vector_as_span(12, 8); } +#else + public ArraySegment? GetVariadicCountsBytes() { return __p.__vector_as_arraysegment(12); } +#endif + public long[] GetVariadicCountsArray() { return __p.__vector_as_array(12); } public static Offset CreateRecordBatch(FlatBufferBuilder builder, long length = 0, VectorOffset nodesOffset = default(VectorOffset), VectorOffset buffersOffset = default(VectorOffset), - Offset compressionOffset = default(Offset)) { - builder.StartTable(4); + Offset compressionOffset = default(Offset), + VectorOffset variadicCountsOffset = default(VectorOffset)) { + builder.StartTable(5); RecordBatch.AddLength(builder, length); + RecordBatch.AddVariadicCounts(builder, variadicCountsOffset); RecordBatch.AddCompression(builder, compressionOffset); RecordBatch.AddBuffers(builder, buffersOffset); RecordBatch.AddNodes(builder, nodesOffset); return RecordBatch.EndRecordBatch(builder); } - public static void StartRecordBatch(FlatBufferBuilder builder) { builder.StartTable(4); } + public static void StartRecordBatch(FlatBufferBuilder builder) { builder.StartTable(5); } public static void AddLength(FlatBufferBuilder builder, long length) { builder.AddLong(0, length, 0); } public static void AddNodes(FlatBufferBuilder builder, VectorOffset nodesOffset) { builder.AddOffset(1, nodesOffset.Value, 0); } public static void StartNodesVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(16, numElems, 8); } public static void AddBuffers(FlatBufferBuilder builder, VectorOffset buffersOffset) { builder.AddOffset(2, buffersOffset.Value, 0); } public static void StartBuffersVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(16, numElems, 8); } public static void AddCompression(FlatBufferBuilder builder, Offset compressionOffset) { builder.AddOffset(3, compressionOffset.Value, 0); } + public static void AddVariadicCounts(FlatBufferBuilder builder, VectorOffset variadicCountsOffset) { builder.AddOffset(4, variadicCountsOffset.Value, 0); } + public static VectorOffset CreateVariadicCountsVector(FlatBufferBuilder builder, long[] data) { builder.StartVector(8, data.Length, 8); for (int i = data.Length - 1; i >= 0; i--) builder.AddLong(data[i]); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, long[] data) { builder.StartVector(8, 
data.Length, 8); builder.Add(data); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, ArraySegment data) { builder.StartVector(8, data.Count, 8); builder.Add(data); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, IntPtr dataPtr, int sizeInBytes) { builder.StartVector(1, sizeInBytes, 1); builder.Add(dataPtr, sizeInBytes); return builder.EndVector(); } + public static void StartVariadicCountsVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(8, numElems, 8); } public static Offset EndRecordBatch(FlatBufferBuilder builder) { int o = builder.EndTable(); return new Offset(o); @@ -75,6 +105,7 @@ static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) && verifier.VerifyVectorOfData(tablePos, 6 /*Nodes*/, 16 /*FieldNode*/, false) && verifier.VerifyVectorOfData(tablePos, 8 /*Buffers*/, 16 /*Buffer*/, false) && verifier.VerifyTable(tablePos, 10 /*Compression*/, BodyCompressionVerify.Verify, false) + && verifier.VerifyVectorOfData(tablePos, 12 /*VariadicCounts*/, 8 /*long*/, false) && verifier.VerifyTableEnd(tablePos); } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs b/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs index 3f9e1de7c00a9..099950fafe4ee 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs @@ -47,6 +47,10 @@ internal struct SparseTensor : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// The dimensions of the tensor, optionally named. public TensorDim? Shape(int j) { int o = __p.__offset(8); return o != 0 ? (TensorDim?)(new TensorDim()).__assign(__p.__indirect(__p.__vector(o) + j * 4), __p.bb) : null; } public int ShapeLength { get { int o = __p.__offset(8); return o != 0 ? __p.__vector_len(o) : 0; } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs b/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs index f8c213768a3fc..eb39257d861ca 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs @@ -46,6 +46,10 @@ internal struct Tensor : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// The dimensions of the tensor, optionally named public TensorDim? Shape(int j) { int o = __p.__offset(8); return o != 0 ? (TensorDim?)(new TensorDim()).__assign(__p.__indirect(__p.__vector(o) + j * 4), __p.bb) : null; } public int ShapeLength { get { int o = __p.__offset(8); return o != 0 ? 
__p.__vector_len(o) : 0; } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs b/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs new file mode 100644 index 0000000000000..e85c5374a9acc --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs @@ -0,0 +1,47 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Logically the same as Utf8, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +internal struct Utf8View : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static Utf8View GetRootAsUtf8View(ByteBuffer _bb) { return GetRootAsUtf8View(_bb, new Utf8View()); } + public static Utf8View GetRootAsUtf8View(ByteBuffer _bb, Utf8View obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public Utf8View __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartUtf8View(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndUtf8View(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class Utf8ViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index d3115da52cc6c..eb7349a570786 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -191,9 +191,7 @@ private List BuildArrays( Field field = schema.GetFieldByIndex(schemaFieldIndex++); Flatbuf.FieldNode fieldNode = recordBatchEnumerator.CurrentNode; - ArrayData arrayData = field.DataType.IsFixedPrimitive() - ? LoadPrimitiveField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator) - : LoadVariableField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); + ArrayData arrayData = LoadField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); arrays.Add(ArrowArrayFactory.BuildArray(arrayData)); } while (recordBatchEnumerator.MoveNextNode()); @@ -229,7 +227,7 @@ private IBufferCreator GetBufferCreator(BodyCompression? 
compression) return new DecompressingBufferCreator(decompressor, _allocator); } - private ArrayData LoadPrimitiveField( + private ArrayData LoadField( MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, Field field, @@ -276,6 +274,16 @@ private ArrayData LoadPrimitiveField( case ArrowTypeId.FixedSizeList: buffers = 1; break; + case ArrowTypeId.String: + case ArrowTypeId.Binary: + case ArrowTypeId.ListView: + buffers = 3; + break; + case ArrowTypeId.StringView: + case ArrowTypeId.BinaryView: + buffers = checked((int)(2 + recordBatchEnumerator.CurrentVariadicCount)); + recordBatchEnumerator.MoveNextVariadicCount(); + break; default: buffers = 2; break; @@ -300,54 +308,6 @@ private ArrayData LoadPrimitiveField( return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children, dictionary?.Data); } - private ArrayData LoadVariableField( - MetadataVersion version, - ref RecordBatchEnumerator recordBatchEnumerator, - Field field, - in Flatbuf.FieldNode fieldNode, - ByteBuffer bodyData, - IBufferCreator bufferCreator) - { - - ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); - } - ArrowBuffer offsetArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); - } - ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - recordBatchEnumerator.MoveNextBuffer(); - - int fieldLength = (int)fieldNode.Length; - int fieldNullCount = (int)fieldNode.NullCount; - - if (fieldLength < 0) - { - throw new InvalidDataException("Field length must be >= 0"); // TODO: Localize exception message - } - - if (fieldNullCount < 0) - { - throw new InvalidDataException("Null count length must be >= 0"); //TODO: Localize exception message - } - - ArrowBuffer[] arrowBuff = new[] { nullArrowBuffer, offsetArrowBuffer, valueArrowBuffer }; - ArrayData[] children = GetChildren(version, ref recordBatchEnumerator, field, bodyData, bufferCreator); - - IArrowArray dictionary = null; - if (field.DataType.TypeId == ArrowTypeId.Dictionary) - { - long id = DictionaryMemo.GetId(field); - dictionary = DictionaryMemo.GetDictionary(id); - } - - return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children, dictionary?.Data); - } - private ArrayData[] GetChildren( MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, @@ -365,11 +325,7 @@ private ArrayData[] GetChildren( Flatbuf.FieldNode childFieldNode = recordBatchEnumerator.CurrentNode; Field childField = type.Fields[index]; - ArrayData child = childField.DataType.IsFixedPrimitive() - ? 
LoadPrimitiveField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator) - : LoadVariableField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); - - children[index] = child; + children[index] = LoadField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); } return children; } @@ -394,11 +350,14 @@ internal struct RecordBatchEnumerator private Flatbuf.RecordBatch RecordBatch { get; } internal int CurrentBufferIndex { get; private set; } internal int CurrentNodeIndex { get; private set; } + internal int CurrentVariadicCountIndex { get; private set; } internal Flatbuf.Buffer CurrentBuffer => RecordBatch.Buffers(CurrentBufferIndex).GetValueOrDefault(); internal Flatbuf.FieldNode CurrentNode => RecordBatch.Nodes(CurrentNodeIndex).GetValueOrDefault(); + internal long CurrentVariadicCount => RecordBatch.VariadicBufferCounts(CurrentVariadicCountIndex); + internal bool MoveNextBuffer() { return ++CurrentBufferIndex < RecordBatch.BuffersLength; @@ -409,11 +368,17 @@ internal bool MoveNextNode() return ++CurrentNodeIndex < RecordBatch.NodesLength; } + internal bool MoveNextVariadicCount() + { + return ++CurrentVariadicCountIndex < RecordBatch.VariadicBufferCountsLength; + } + internal RecordBatchEnumerator(in Flatbuf.RecordBatch recordBatch) { RecordBatch = recordBatch; CurrentBufferIndex = 0; CurrentNodeIndex = 0; + CurrentVariadicCountIndex = 0; } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 5f490019b2133..07d1dcfdb171d 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -54,9 +54,12 @@ internal class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -81,6 +84,7 @@ public Buffer(ArrowBuffer buffer, int offset) public IReadOnlyList Buffers => _buffers; + public List VariadicCounts { get; private set; } public int TotalLength { get; private set; } public ArrowRecordBatchFlatBufferBuilder() @@ -121,6 +125,15 @@ public void Visit(ListArray array) array.Values.Accept(this); } + public void Visit(ListViewArray array) + { + _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBuffer(array.ValueOffsetsBuffer)); + _buffers.Add(CreateBuffer(array.SizesBuffer)); + + array.Values.Accept(this); + } + public void Visit(FixedSizeListArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -130,6 +143,8 @@ public void Visit(FixedSizeListArray array) public void Visit(StringArray array) => Visit(array as BinaryArray); + public void Visit(StringViewArray array) => Visit(array as BinaryViewArray); + public void Visit(BinaryArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -137,6 +152,18 @@ public void Visit(BinaryArray array) _buffers.Add(CreateBuffer(array.ValueBuffer)); } + public void Visit(BinaryViewArray array) + { + _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBuffer(array.ViewsBuffer)); + for (int i = 0; i < array.DataBufferCount; i++) + { + _buffers.Add(CreateBuffer(array.DataBuffer(i))); + } + VariadicCounts = VariadicCounts ?? 
new List(); + VariadicCounts.Add(array.DataBufferCount); + } + public void Visit(FixedSizeBinaryArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -328,7 +355,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) HasWrittenDictionaryBatch = true; } - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = PreparingWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -339,7 +366,9 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); long metadataLength = WriteMessage(Flatbuf.MessageHeader.RecordBatch, recordBatchOffset, recordBatchBuilder.TotalLength); @@ -367,7 +396,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat HasWrittenDictionaryBatch = true; } - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = PreparingWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -378,7 +407,9 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, recordBatchOffset, recordBatchBuilder.TotalLength, @@ -451,12 +482,12 @@ private async ValueTask WriteBufferDataAsync(IReadOnlyList PreparingWritingRecordBatch(RecordBatch recordBatch) + private Tuple PreparingWritingRecordBatch(RecordBatch recordBatch) { return PreparingWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); } - private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) + private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) { Builder.Clear(); @@ -483,6 +514,12 @@ private Tuple PreparingWritingR fieldArray.Accept(recordBatchBuilder); } + VectorOffset variadicCountOffset = default; + if (recordBatchBuilder.VariadicCounts != null) + { + variadicCountOffset = Flatbuf.RecordBatch.CreateVariadicCountsVectorBlock(Builder, recordBatchBuilder.VariadicCounts.ToArray()); + } + IReadOnlyList buffers = recordBatchBuilder.Buffers; Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count); @@ -494,7 +531,7 @@ private Tuple PreparingWritingR buffers[i].Offset, buffers[i].DataBuffer.Length); } - return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset); + return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset, variadicCountOffset); } private protected virtual void StartingWritingDictionary() @@ -561,7 +598,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, var arrays = new List { dictionary }; - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = 
PreparingWritingRecordBatch(fields, arrays); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -569,7 +606,9 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, // Serialize record batch Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, dictionary.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); // TODO: Support delta. Offset dictionaryBatchOffset = Flatbuf.DictionaryBatch.CreateDictionaryBatch(Builder, id, recordBatchOffset, false); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs index 84ff4f9cc7202..473e18968f8cb 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs @@ -50,9 +50,13 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, +#if NET5_0_OR_GREATER + IArrowTypeVisitor, +#endif IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -60,8 +64,10 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -106,6 +112,14 @@ public void Visit(BinaryType type) Flatbuf.Binary.EndBinary(Builder)); } + public void Visit(BinaryViewType type) + { + Flatbuf.BinaryView.StartBinaryView(Builder); + Offset offset = Flatbuf.BinaryView.EndBinaryView(Builder); + Result = FieldType.Build( + Flatbuf.Type.BinaryView, offset); + } + public void Visit(ListType type) { Flatbuf.List.StartList(Builder); @@ -114,6 +128,14 @@ public void Visit(ListType type) Flatbuf.List.EndList(Builder)); } + public void Visit(ListViewType type) + { + Flatbuf.ListView.StartListView(Builder); + Result = FieldType.Build( + Flatbuf.Type.ListView, + Flatbuf.ListView.EndListView(Builder)); + } + public void Visit(FixedSizeListType type) { Result = FieldType.Build( @@ -136,6 +158,14 @@ public void Visit(StringType type) Flatbuf.Type.Utf8, offset); } + public void Visit(StringViewType type) + { + Flatbuf.Utf8View.StartUtf8View(Builder); + Offset offset = Flatbuf.Utf8View.EndUtf8View(Builder); + Result = FieldType.Build( + Flatbuf.Type.Utf8View, offset); + } + public void Visit(TimestampType type) { StringOffset timezoneStringOffset = default; @@ -169,6 +199,15 @@ public void Visit(Time32Type type) Flatbuf.Time.CreateTime(Builder, ToFlatBuffer(type.Unit))); } +#if NET5_0_OR_GREATER + public void Visit(HalfFloatType type) + { + Result = FieldType.Build( + Flatbuf.Type.FloatingPoint, + Flatbuf.FloatingPoint.CreateFloatingPoint(Builder, Precision.HALF)); + } +#endif + public void Visit(FloatType type) { Result = FieldType.Build( diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs index 633554fc53261..0e6f330aef091 100644 --- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -184,17 +184,27 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c return Types.IntervalType.FromIntervalUnit(intervalMetadata.Unit.ToArrow()); case Flatbuf.Type.Utf8: return Types.StringType.Default; + case Flatbuf.Type.Utf8View: + return Types.StringViewType.Default; case Flatbuf.Type.FixedSizeBinary: Flatbuf.FixedSizeBinary fixedSizeBinaryMetadata = 
field.Type().Value; return new Types.FixedSizeBinaryType(fixedSizeBinaryMetadata.ByteWidth); case Flatbuf.Type.Binary: return Types.BinaryType.Default; + case Flatbuf.Type.BinaryView: + return Types.BinaryViewType.Default; case Flatbuf.Type.List: if (childFields == null || childFields.Length != 1) { throw new InvalidDataException($"List type must have exactly one child."); } return new Types.ListType(childFields[0]); + case Flatbuf.Type.ListView: + if (childFields == null || childFields.Length != 1) + { + throw new InvalidDataException($"List view type must have exactly one child."); + } + return new Types.ListViewType(childFields[0]); case Flatbuf.Type.FixedSizeList: if (childFields == null || childFields.Length != 1) { diff --git a/csharp/src/Apache.Arrow/Scalars/BinaryView.cs b/csharp/src/Apache.Arrow/Scalars/BinaryView.cs new file mode 100644 index 0000000000000..eaba89c7a3a8e --- /dev/null +++ b/csharp/src/Apache.Arrow/Scalars/BinaryView.cs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Apache.Arrow.Scalars +{ + [StructLayout(LayoutKind.Explicit)] + public unsafe struct BinaryView : IEquatable + { + public const int PrefixLength = 4; + public const int MaxInlineLength = 12; + + [FieldOffset(0)] + public readonly int Length; + + [FieldOffset(4)] + internal readonly int _prefix; + + [FieldOffset(8)] + internal readonly int _bufferIndex; + + [FieldOffset(12)] + internal readonly int _bufferOffset; + + [FieldOffset(4)] + internal fixed byte _inline[MaxInlineLength]; + + public unsafe BinaryView(ReadOnlySpan inline) : this() + { + if (inline.Length > MaxInlineLength) + { + throw new ArgumentException("invalid inline data length", nameof(inline)); + } + + Length = inline.Length; + fixed (byte* dest = _inline) + fixed (byte* src = inline) + { + Buffer.MemoryCopy(src, dest, MaxInlineLength, inline.Length); + } + } + + public BinaryView(int length, ReadOnlySpan prefix, int bufferIndex, int bufferOffset) + { + if (length < MaxInlineLength) + { + throw new ArgumentException("invalid length", nameof(length)); + } + if (prefix.Length != PrefixLength) + { + throw new ArgumentException("invalid prefix length", nameof(prefix)); + } + + Length = length; + _bufferIndex = bufferIndex; + _bufferOffset = bufferOffset; + _prefix = prefix.CastTo()[0]; + } + + private BinaryView(int length, int prefix, int bufferIndex, int offset) + { + Length = length; + _prefix = prefix; + _bufferIndex = bufferIndex; + _bufferOffset = offset; + } + + public bool IsInline => Length <= MaxInlineLength; + +#if NET5_0_OR_GREATER + public ReadOnlySpan Bytes => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.AsRef(_inline[0]), IsInline ? 
Length : PrefixLength); +#else + public unsafe ReadOnlySpan Bytes => new ReadOnlySpan(Unsafe.AsPointer(ref _inline[0]), IsInline ? Length : PrefixLength); +#endif + + public int BufferIndex => IsInline ? -1 : _bufferIndex; + + public int BufferOffset => IsInline ? -1 : _bufferOffset; + + public override int GetHashCode() => Length ^ _prefix ^ _bufferIndex ^ _bufferOffset; + + public override bool Equals(object obj) + { + BinaryView? other = obj as BinaryView?; + return other != null && Equals(other.Value); + } + + public bool Equals(BinaryView other) => + Length == other.Length && _prefix == other._prefix && _bufferIndex == other._bufferIndex && _bufferOffset == other._bufferOffset; + + internal BinaryView AdjustBufferIndex(int bufferOffset) + { + return new BinaryView(Length, _prefix, _bufferIndex + bufferOffset, _bufferOffset); + } + } +} diff --git a/csharp/src/Apache.Arrow/Types/BinaryViewType.cs b/csharp/src/Apache.Arrow/Types/BinaryViewType.cs new file mode 100644 index 0000000000000..f5cfc034dc967 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/BinaryViewType.cs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +namespace Apache.Arrow.Types +{ + public class BinaryViewType: ArrowType + { + public static readonly BinaryViewType Default = new BinaryViewType(); + + public override ArrowTypeId TypeId => ArrowTypeId.BinaryView; + public override string Name => "binaryview"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/IArrowType.cs b/csharp/src/Apache.Arrow/Types/IArrowType.cs index 5e107813be828..cf520391fe1e6 100644 --- a/csharp/src/Apache.Arrow/Types/IArrowType.cs +++ b/csharp/src/Apache.Arrow/Types/IArrowType.cs @@ -50,6 +50,9 @@ public enum ArrowTypeId FixedSizeList, Duration, RecordBatch, + BinaryView, + StringView, + ListView, } public interface IArrowType diff --git a/csharp/src/Apache.Arrow/Types/ListViewType.cs b/csharp/src/Apache.Arrow/Types/ListViewType.cs new file mode 100644 index 0000000000000..ecf745723c4ae --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/ListViewType.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace Apache.Arrow.Types +{ + public sealed class ListViewType : NestedType + { + public override ArrowTypeId TypeId => ArrowTypeId.ListView; + public override string Name => "listview"; + + public Field ValueField => Fields[0]; + + public IArrowType ValueDataType => Fields[0].DataType; + + public ListViewType(Field valueField) + : base(valueField) { } + + public ListViewType(IArrowType valueDataType) + : this(new Field("item", valueDataType, true)) { } + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/StringViewType.cs b/csharp/src/Apache.Arrow/Types/StringViewType.cs new file mode 100644 index 0000000000000..0c539a56b03b5 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/StringViewType.cs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
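`ListViewType` above mirrors `ListType`, but per the ListView format comments earlier in this patch the array carries separate offsets and sizes buffers, so list slots can be written in any order and can share child values. A minimal sketch of that addressing rule, using plain arrays rather than the Arrow buffer classes:

```csharp
using System;

static class ListViewSemanticsDemo
{
    static void Main()
    {
        // Child values shared by all list slots.
        long[] values = { 10, 20, 30, 40, 50 };

        // Unlike List, offsets need not be monotonic and ranges may overlap,
        // so slot 0 and slot 2 below share the child value at index 3.
        int[] offsets = { 3, 0, 0 };
        int[] sizes   = { 2, 0, 4 };

        for (int i = 0; i < offsets.Length; i++)
        {
            // List slot i is values[offsets[i] .. offsets[i] + sizes[i]).
            var slice = new ArraySegment<long>(values, offsets[i], sizes[i]);
            Console.WriteLine($"list[{i}] = [{string.Join(", ", slice)}]");
        }
        // Prints: list[0] = [40, 50], list[1] = [], list[2] = [10, 20, 30, 40]
    }
}
```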
+ + +namespace Apache.Arrow.Types +{ + public sealed class StringViewType : ArrowType + { + public static StringViewType Default = new StringViewType(); + + public override ArrowTypeId TypeId => ArrowTypeId.StringView; + public override string Name => "utf8view"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs b/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs index c791c9969356a..f35c2a5d78d79 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs +++ b/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs @@ -38,7 +38,7 @@ public class ArrowWriterBenchmark [GlobalSetup] public void GlobalSetup() { - _batch = TestData.CreateSampleRecordBatch(BatchLength, ColumnSetCount, false); + _batch = TestData.CreateSampleRecordBatch(BatchLength, ColumnSetCount); _memoryStream = new MemoryStream(); } diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index f3fe73588a7bb..31a5676f01315 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -21,6 +21,7 @@ using System.Numerics; using System.Text; using System.Text.Json; +using System.Text.Json.Nodes; using System.Text.Json.Serialization; using System.Threading.Tasks; using Apache.Arrow.Arrays; @@ -175,7 +176,9 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "floatingpoint" => ToFloatingPointArrowType(type), "decimal" => ToDecimalArrowType(type), "binary" => BinaryType.Default, + "binaryview" => BinaryViewType.Default, "utf8" => StringType.Default, + "utf8view" => StringViewType.Default, "fixedsizebinary" => new FixedSizeBinaryType(type.ByteWidth), "date" => ToDateArrowType(type), "time" => ToTimeArrowType(type), @@ -184,6 +187,7 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "interval_mdn" => ToIntervalArrowType(type), "timestamp" => ToTimestampArrowType(type), "list" => ToListArrowType(type, children), + "listview" => ToListViewArrowType(type, children), "fixedsizelist" => ToFixedSizeListArrowType(type, children), "struct" => ToStructArrowType(type, children), "union" => ToUnionArrowType(type, children), @@ -294,6 +298,11 @@ private static IArrowType ToListArrowType(JsonArrowType type, Field[] children) return new ListType(children[0]); } + private static IArrowType ToListViewArrowType(JsonArrowType type, Field[] children) + { + return new ListViewType(children[0]); + } + private static IArrowType ToFixedSizeListArrowType(JsonArrowType type, Field[] children) { return new FixedSizeListType(children[0], type.ListSize); @@ -451,9 +460,12 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -652,6 +664,38 @@ public void Visit(StringType type) Array = new StringArray(JsonFieldData.Count, offsetBuffer, valueBuffer, validityBuffer, nullCount); } + public void Visit(StringViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + + // ArrowBuffer viewsBuffer = GetViewsBuffer(); + ArrowBuffer viewsBuffer = ArrowBuffer.Empty; + if (JsonFieldData.Views != null) + { + ArrowBuffer.Builder viewBuilder = new ArrowBuffer.Builder(JsonFieldData.Views.Count); + foreach 
(JsonView jsonView in JsonFieldData.Views) + { + BinaryView view = (jsonView.BufferIndex == null) ? + new BinaryView(Encoding.UTF8.GetBytes(jsonView.Inlined)) : + new BinaryView(jsonView.Size, Convert.FromHexString(jsonView.PrefixHex), jsonView.BufferIndex.Value, jsonView.Offset.Value); + viewBuilder.Append(view); + } + viewsBuffer = viewBuilder.Build(); + } + + int bufferCount = JsonFieldData.VariadicDataBuffers?.Count ?? 0; + ArrowBuffer[] buffers = new ArrowBuffer[2 + bufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewsBuffer; + for (int i = 0; i < bufferCount; i++) + { + buffers[i + 2] = new ArrowBuffer(Convert.FromHexString(JsonFieldData.VariadicDataBuffers[i])).Clone(); + } + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers); + Array = new StringViewArray(arrayData); + } + public void Visit(BinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -671,6 +715,38 @@ public void Visit(BinaryType type) Array = new BinaryArray(arrayData); } + public void Visit(BinaryViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + + // ArrowBuffer viewsBuffer = GetViewsBuffer(); + ArrowBuffer viewsBuffer = ArrowBuffer.Empty; + if (JsonFieldData.Views != null) + { + ArrowBuffer.Builder viewBuilder = new ArrowBuffer.Builder(JsonFieldData.Views.Count); + foreach (JsonView jsonView in JsonFieldData.Views) + { + BinaryView view = (jsonView.BufferIndex == null) ? + new BinaryView(Convert.FromHexString(jsonView.Inlined)) : + new BinaryView(jsonView.Size, Convert.FromHexString(jsonView.PrefixHex), jsonView.BufferIndex.Value, jsonView.Offset.Value); + viewBuilder.Append(view); + } + viewsBuffer = viewBuilder.Build(); + } + + int bufferCount = JsonFieldData.VariadicDataBuffers?.Count ?? 
0; + ArrowBuffer[] buffers = new ArrowBuffer[2 + bufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewsBuffer; + for (int i = 0; i < bufferCount; i++) + { + buffers[i + 2] = new ArrowBuffer(Convert.FromHexString(JsonFieldData.VariadicDataBuffers[i])).Clone(); + } + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers); + Array = new BinaryViewArray(arrayData); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -704,6 +780,22 @@ public void Visit(ListType type) Array = new ListArray(arrayData); } + public void Visit(ListViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + ArrowBuffer offsetBuffer = GetOffsetBuffer(); + ArrowBuffer sizeBuffer = GetSizeBuffer(); + + var data = JsonFieldData; + JsonFieldData = data.Children[0]; + type.ValueDataType.Accept(this); + JsonFieldData = data; + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, + new[] { validityBuffer, offsetBuffer, sizeBuffer }, new[] { Array.Data }); + Array = new ListViewArray(arrayData); + } + public void Visit(FixedSizeListType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -878,11 +970,18 @@ private void GenerateArray(Func valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Length); - valueOffsets.AppendRange(JsonFieldData.Offset); + ArrowBuffer.Builder valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Count); + valueOffsets.AppendRange(JsonFieldData.IntOffset); return valueOffsets.Build(default); } + private ArrowBuffer GetSizeBuffer() + { + ArrowBuffer.Builder valueSizes = new ArrowBuffer.Builder(JsonFieldData.Size.Count); + valueSizes.AppendRange(JsonFieldData.IntSize); + return valueSizes.Build(default); + } + private ArrowBuffer GetTypeIdBuffer() { ArrowBuffer.Builder typeIds = new ArrowBuffer.Builder(JsonFieldData.TypeId.Length); @@ -920,10 +1019,61 @@ public class JsonFieldData public string Name { get; set; } public int Count { get; set; } public bool[] Validity { get; set; } - public int[] Offset { get; set; } + public JsonArray Offset { get; set; } + + [JsonPropertyName("SIZE")] + public JsonArray Size { get; set; } public int[] TypeId { get; set; } public JsonElement Data { get; set; } public List Children { get; set; } + + [JsonPropertyName("VIEWS")] + public List Views { get; set; } + + [JsonPropertyName("VARIADIC_DATA_BUFFERS")] + public List VariadicDataBuffers { get; set; } + + [JsonIgnore] + public IEnumerable IntOffset + { + get { return Offset.Select(GetInt); } + } + + [JsonIgnore] + public IEnumerable IntSize + { + get { return Size.Select(GetInt); } + } + + static int GetInt(JsonNode node) + { + try + { + return node.GetValue(); + } + catch + { + return int.Parse(node.GetValue()); + } + } + } + + public class JsonView + { + [JsonPropertyName("SIZE")] + public int Size { get; set; } + + [JsonPropertyName("INLINED")] + public string Inlined { get; set; } + + [JsonPropertyName("PREFIX_HEX")] + public string PrefixHex { get; set; } + + [JsonPropertyName("BUFFER_INDEX")] + public int? BufferIndex { get; set; } + + [JsonPropertyName("OFFSET")] + public int? 
Offset { get; set; } } internal sealed class ValidityConverter : JsonConverter diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json b/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json new file mode 100644 index 0000000000000..46bdeff290e17 --- /dev/null +++ b/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json @@ -0,0 +1,8 @@ +{ + "profiles": { + "Apache.Arrow.IntegrationTest": { + "commandName": "Project", + "commandLineArgs": "--mode validate -j C:\\Users\\curt\\AppData\\Local\\Temp\\arrow-integration-9_cov7dz\\generated_binary_view.json -a C:\\Users\\curt\\AppData\\Local\\Temp\\tmpxicbzqpn\\460a151e_generated_binary_view.json_as_file" + } + } +} \ No newline at end of file diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs index 137dc16d473a4..25ef289f0dc25 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs @@ -64,13 +64,16 @@ private static IEnumerable, IArrowArray>> GenerateTestDa FloatType.Default, DoubleType.Default, BinaryType.Default, + BinaryViewType.Default, StringType.Default, + StringViewType.Default, Date32Type.Default, Date64Type.Default, TimestampType.Default, new Decimal128Type(14, 10), new Decimal256Type(14,10), new ListType(Int64Type.Default), + new ListViewType(Int64Type.Default), new StructType(new List{ new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() @@ -122,7 +125,9 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -131,6 +136,7 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -368,6 +374,34 @@ public void Visit(BinaryType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(BinaryViewType type) + { + BinaryViewArray.Builder resultBuilder = new BinaryViewArray.Builder().Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + BinaryViewArray.Builder builder = new BinaryViewArray.Builder().Reserve(dataList.Count); + + foreach (byte? value in dataList) + { + if (value.HasValue) + { + builder.Append(value.Value); + resultBuilder.Append(value.Value); + } + else + { + builder.AppendNull(); + resultBuilder.AppendNull(); + } + } + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(StringType type) { StringArray.Builder resultBuilder = new StringArray.Builder().Reserve(_baseDataTotalElementCount); @@ -388,6 +422,26 @@ public void Visit(StringType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(StringViewType type) + { + StringViewArray.Builder resultBuilder = new StringViewArray.Builder().Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + StringViewArray.Builder builder = new StringViewArray.Builder().Reserve(dataList.Count); + + foreach (string value in dataList.Select(_ => _.ToString() ?? 
null)) + { + builder.Append(value); + resultBuilder.Append(value); + } + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(ListType type) { ListArray.Builder resultBuilder = new ListArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); @@ -423,6 +477,41 @@ public void Visit(ListType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(ListViewType type) + { + ListViewArray.Builder resultBuilder = new ListViewArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); + Int64Array.Builder resultValueBuilder = (Int64Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + + ListViewArray.Builder builder = new ListViewArray.Builder(type.ValueField).Reserve(dataList.Count); + Int64Array.Builder valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); + + foreach (long? value in dataList) + { + if (value.HasValue) + { + builder.Append(); + resultBuilder.Append(); + + valueBuilder.Append(value.Value); + resultValueBuilder.Append(value.Value); + } + else + { + builder.AppendNull(); + resultBuilder.AppendNull(); + } + } + + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(FixedSizeListType type) { FixedSizeListArray.Builder resultBuilder = new FixedSizeListArray.Builder(type.ValueDataType, type.ListSize).Reserve(_baseDataTotalElementCount); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 2aaffe7835258..10315ff287c0b 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -20,6 +20,7 @@ using System.Threading.Tasks; using Apache.Arrow.Arrays; using Xunit; +using System.Diagnostics; namespace Apache.Arrow.Tests { @@ -90,10 +91,13 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -136,12 +140,15 @@ public ArrayComparer(IArrowArray expectedArray, bool strictCompare) public void Visit(DayTimeIntervalArray array) => CompareArrays(array); public void Visit(MonthDayNanosecondIntervalArray array) => CompareArrays(array); public void Visit(ListArray array) => CompareArrays(array); + public void Visit(ListViewArray array) => CompareArrays(array); public void Visit(FixedSizeListArray array) => CompareArrays(array); public void Visit(FixedSizeBinaryArray array) => CompareArrays(array); public void Visit(Decimal128Array array) => CompareArrays(array); public void Visit(Decimal256Array array) => CompareArrays(array); public void Visit(StringArray array) => CompareBinaryArrays(array); + public void Visit(StringViewArray array) => CompareVariadicArrays(array); public void Visit(BinaryArray array) => CompareBinaryArrays(array); + public void Visit(BinaryViewArray array) => CompareVariadicArrays(array); public void Visit(StructArray array) { @@ -230,6 +237,32 @@ private void CompareBinaryArrays(BinaryArray actualArray) } } + private void CompareVariadicArrays(BinaryViewArray actualArray) + where T : IArrowArray + { + Assert.IsAssignableFrom(_expectedArray); + Assert.IsAssignableFrom(actualArray); + + var expectedArray = 
(BinaryViewArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + Assert.Equal(expectedArray.Offset, actualArray.Offset); + + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + + Assert.True(expectedArray.Views.SequenceEqual(actualArray.Views)); + + for (int i = 0; i < expectedArray.Length; i++) + { + Assert.True( + expectedArray.GetBytes(i).SequenceEqual(actualArray.GetBytes(i)), + $"BinaryArray values do not match at index {i}."); + } + } + private void CompareArrays(FixedSizeBinaryArray actualArray) { Assert.IsAssignableFrom(_expectedArray); @@ -346,6 +379,34 @@ private void CompareArrays(ListArray actualArray) actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } + private void CompareArrays(ListViewArray actualArray) + { + Assert.IsAssignableFrom(_expectedArray); + ListViewArray expectedArray = (ListViewArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + Assert.Equal(expectedArray.Offset, actualArray.Offset); + + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + + if (_strictCompare) + { + Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span)); + } + else + { + int length = expectedArray.Length * sizeof(int); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(0, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length))); + Assert.True(expectedArray.SizesBuffer.Span.Slice(0, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length))); + } + + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); + } + private void CompareArrays(FixedSizeListArray actualArray) { Assert.IsAssignableFrom(_expectedArray); diff --git a/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs new file mode 100644 index 0000000000000..eb617b4dedc75 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
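The BinaryViewTests below exercise the `Apache.Arrow.Scalars.BinaryView` struct added earlier in this patch. As a quick usage sketch of its two constructors (the buffer index and offset are invented here; in a real array they are assigned when a value does not fit inline):

```csharp
using System;
using System.Text;
using Apache.Arrow.Scalars;

static class BinaryViewScalarSketch
{
    static void Main()
    {
        // Values of at most 12 bytes live entirely inside the 16-byte view.
        var inline = new BinaryView(Encoding.UTF8.GetBytes("hello"));
        Console.WriteLine(inline.IsInline);     // True
        Console.WriteLine(inline.BufferIndex);  // -1: no data buffer is referenced

        // Longer values keep a 4-byte prefix inline and point into one of the
        // variadic data buffers; index 0 and offset 128 are made-up example values.
        byte[] prefix = Encoding.UTF8.GetBytes("long");  // exactly 4 bytes
        var reference = new BinaryView(42, prefix, bufferIndex: 0, bufferOffset: 128);
        Console.WriteLine(reference.IsInline);  // False
        Console.WriteLine(reference.Length);    // 42
    }
}
```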
+ +using System; +using Apache.Arrow.Scalars; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class BinaryViewTests + { + private static readonly byte[] empty = new byte[0]; + private static readonly byte[] oneByte = new byte[1]; + private static readonly byte[] fourBytes = new byte[] { 1, 2, 3, 4 }; + private static readonly byte[] fiveBytes = new byte[] { 5, 4, 3, 2, 1 }; + private static readonly byte[] twelveBytes = new byte[] { 1, 2, 3, 4, 8, 7, 6, 5, 9, 10, 11, 12 }; + private static readonly byte[] thirteenBytes = new byte[13]; + + [Fact] + public void Equality() + { + BinaryView one = new BinaryView(oneByte); + BinaryView four = new BinaryView(fourBytes); + BinaryView twelve = new BinaryView(twelveBytes); + BinaryView twelvePlus = new BinaryView(13, fourBytes, 0, 0); + Assert.Equal(one, one); + Assert.NotEqual(one, four); + Assert.NotEqual(four, twelve); + Assert.NotEqual(four, twelvePlus); + } + + [Fact] + public void ConstructorThrows() + { + Assert.Throws(() => new BinaryView(thirteenBytes)); + Assert.Throws(() => new BinaryView(20, empty, 0, 0)); + Assert.Throws(() => new BinaryView(20, fiveBytes, 0, 0)); + Assert.Throws(() => new BinaryView(13, thirteenBytes, 0, 0)); + Assert.Throws(() => new BinaryView(4, fourBytes, 0, 0)); + } + + [Fact] + public void ConstructInline() + { + BinaryView zero = new BinaryView(empty); + Assert.Equal(-1, zero.BufferIndex); + Assert.Equal(-1, zero.BufferOffset); + Assert.Equal(0, zero.Length); + Assert.Equal(0, zero.Bytes.Length); + + BinaryView one = new BinaryView(oneByte); + Assert.Equal(-1, one.BufferIndex); + Assert.Equal(-1, one.BufferOffset); + Assert.Equal(1, one.Length); + Assert.Equal(1, one.Bytes.Length); + Assert.Equal((byte)0, one.Bytes[0]); + + BinaryView twelve = new BinaryView(twelveBytes); + Assert.Equal(-1, one.BufferIndex); + Assert.Equal(-1, one.BufferOffset); + Assert.Equal(12, twelve.Length); + Assert.Equal(12, twelve.Bytes.Length); + Assert.Equal((byte)8, twelve.Bytes[4]); + } + + [Fact] + public void ConstructPrefix() + { + BinaryView four = new BinaryView(14, fourBytes, 2, 3); + Assert.Equal(2, four.BufferIndex); + Assert.Equal(3, four.BufferOffset); + Assert.Equal(14, four.Length); + Assert.Equal(4, four.Bytes.Length); + Assert.Equal((byte)2, four.Bytes[1]); + } + } +} diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 83902d8d93c70..274434e4bab09 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -741,7 +741,9 @@ public unsafe void ExportBatch() [SkippableFact] public unsafe void RoundTripTestBatch() { - RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, createDictionaryArray: true); + // TODO: Enable these once this the version of pyarrow referenced during testing supports them + HashSet unsupported = new HashSet { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView }; + RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported); RecordBatch batch2 = batch1.Clone(); CArrowArray* cExportArray = CArrowArray.Create(); diff --git a/csharp/test/Apache.Arrow.Tests/TableTests.cs b/csharp/test/Apache.Arrow.Tests/TableTests.cs index d52b514e092d9..83c88265d172b 100644 --- a/csharp/test/Apache.Arrow.Tests/TableTests.cs +++ b/csharp/test/Apache.Arrow.Tests/TableTests.cs @@ -62,7 +62,11 @@ public void TestTableFromRecordBatches() Table table1 = Table.TableFromRecordBatches(recordBatch1.Schema, 
recordBatches); Assert.Equal(20, table1.RowCount); - Assert.Equal(30, table1.ColumnCount); +#if NET5_0_OR_GREATER + Assert.Equal(35, table1.ColumnCount); +#else + Assert.Equal(34, table1.ColumnCount); +#endif Assert.Equal("ChunkedArray: Length=20, DataType=list", table1.Column(0).Data.ToString()); FixedSizeBinaryType type = new FixedSizeBinaryType(17); diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index b43321abd7499..29ddef2864862 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -24,53 +24,66 @@ namespace Apache.Arrow.Tests { public static class TestData { - public static RecordBatch CreateSampleRecordBatch(int length, bool createDictionaryArray = true) + public static RecordBatch CreateSampleRecordBatch(int length, bool createDictionaryArray) { - return CreateSampleRecordBatch(length, columnSetCount: 1, createDictionaryArray); + HashSet excluded = createDictionaryArray ? null : new HashSet { ArrowTypeId.Dictionary }; + return CreateSampleRecordBatch(length, columnSetCount: 1, excluded); } - public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount, bool createAdvancedTypeArrays) + public static RecordBatch CreateSampleRecordBatch( + int length, + int columnSetCount = 1, + HashSet excludedTypes = null) { Schema.Builder builder = new Schema.Builder(); - for (int i = 0; i < columnSetCount; i++) + + void AddField(Field field) { - builder.Field(CreateField(new ListType(Int64Type.Default), i)); - builder.Field(CreateField(BooleanType.Default, i)); - builder.Field(CreateField(UInt8Type.Default, i)); - builder.Field(CreateField(Int8Type.Default, i)); - builder.Field(CreateField(UInt16Type.Default, i)); - builder.Field(CreateField(Int16Type.Default, i)); - builder.Field(CreateField(UInt32Type.Default, i)); - builder.Field(CreateField(Int32Type.Default, i)); - builder.Field(CreateField(UInt64Type.Default, i)); - builder.Field(CreateField(Int64Type.Default, i)); - builder.Field(CreateField(FloatType.Default, i)); - builder.Field(CreateField(DoubleType.Default, i)); - builder.Field(CreateField(Date32Type.Default, i)); - builder.Field(CreateField(Date64Type.Default, i)); - builder.Field(CreateField(Time32Type.Default, i)); - builder.Field(CreateField(Time64Type.Default, i)); - builder.Field(CreateField(TimestampType.Default, i)); - builder.Field(CreateField(StringType.Default, i)); - builder.Field(CreateField(new StructType(new List { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }), i)); - builder.Field(CreateField(new Decimal128Type(10, 6), i)); - builder.Field(CreateField(new Decimal256Type(16, 8), i)); - builder.Field(CreateField(new MapType(StringType.Default, Int32Type.Default), i)); - builder.Field(CreateField(IntervalType.YearMonth, i)); - builder.Field(CreateField(IntervalType.DayTime, i)); - builder.Field(CreateField(IntervalType.MonthDayNanosecond, i)); - - if (createAdvancedTypeArrays) + if (excludedTypes == null || !excludedTypes.Contains(field.DataType.TypeId)) { - builder.Field(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); - builder.Field(CreateField(new FixedSizeBinaryType(16), i)); - builder.Field(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); - builder.Field(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); - builder.Field(CreateField(new UnionType(new[] { 
CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); + builder.Field(field); } + } - //builder.Field(CreateField(HalfFloatType.Default)); - //builder.Field(CreateField(StringType.Default)); + for (int i = 0; i < columnSetCount; i++) + { + AddField(CreateField(new ListType(Int64Type.Default), i)); + AddField(CreateField(new ListViewType(Int64Type.Default), i)); + AddField(CreateField(BooleanType.Default, i)); + AddField(CreateField(UInt8Type.Default, i)); + AddField(CreateField(Int8Type.Default, i)); + AddField(CreateField(UInt16Type.Default, i)); + AddField(CreateField(Int16Type.Default, i)); + AddField(CreateField(UInt32Type.Default, i)); + AddField(CreateField(Int32Type.Default, i)); + AddField(CreateField(UInt64Type.Default, i)); + AddField(CreateField(Int64Type.Default, i)); +#if NET5_0_OR_GREATER + AddField(CreateField(HalfFloatType.Default, i)); +#endif + AddField(CreateField(FloatType.Default, i)); + AddField(CreateField(DoubleType.Default, i)); + AddField(CreateField(Date32Type.Default, i)); + AddField(CreateField(Date64Type.Default, i)); + AddField(CreateField(Time32Type.Default, i)); + AddField(CreateField(Time64Type.Default, i)); + AddField(CreateField(TimestampType.Default, i)); + AddField(CreateField(StringType.Default, i)); + AddField(CreateField(StringViewType.Default, i)); + AddField(CreateField(new StructType(new List { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }), i)); + AddField(CreateField(new Decimal128Type(10, 6), i)); + AddField(CreateField(new Decimal256Type(16, 8), i)); + AddField(CreateField(new MapType(StringType.Default, Int32Type.Default), i)); + AddField(CreateField(IntervalType.YearMonth, i)); + AddField(CreateField(IntervalType.DayTime, i)); + AddField(CreateField(IntervalType.MonthDayNanosecond, i)); + AddField(CreateField(BinaryType.Default, i)); + AddField(CreateField(BinaryViewType.Default, i)); + AddField(CreateField(new FixedSizeBinaryType(16), i)); + AddField(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); + AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); + AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); + AddField(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); } Schema schema = builder.Build(); @@ -130,16 +143,23 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, +#if NET5_0_OR_GREATER + IArrowTypeVisitor, +#endif IArrowTypeVisitor { private int Length { get; } @@ -160,6 +180,9 @@ public ArrayCreator(int length) public void Visit(UInt32Type type) => GenerateArray(new UInt32Array.Builder(), x => (uint)x); public void Visit(UInt64Type type) => GenerateArray(new UInt64Array.Builder(), x => (ulong)x); public void Visit(FloatType type) => GenerateArray(new FloatArray.Builder(), x => ((float)x / Length)); +#if NET5_0_OR_GREATER + public void Visit(HalfFloatType type) => GenerateArray(new HalfFloatArray.Builder(), x => ((Half)x / (Half)Length)); +#endif public void Visit(DoubleType type) => 
GenerateArray(new DoubleArray.Builder(), x => ((double)x / Length)); public void Visit(Decimal128Type type) { @@ -277,6 +300,30 @@ public void Visit(StringType type) Array = builder.Build(); } + public void Visit(StringViewType type) + { + var str = "length=ten"; + var builder = new StringViewArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(str); + break; + case 2: + builder.Append(str + str); + break; + } + } + + Array = builder.Build(); + } + public void Visit(ListType type) { var builder = new ListArray.Builder(type.ValueField).Reserve(Length); @@ -294,6 +341,23 @@ public void Visit(ListType type) Array = builder.Build(); } + public void Visit(ListViewType type) + { + var builder = new ListViewArray.Builder(type.ValueField).Reserve(Length); + + var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1); + + for (var i = 0; i < Length; i++) + { + builder.Append(); + valueBuilder.Append(i); + } + //Add a value to check if Values.Length can exceed ListArray.Length + valueBuilder.Append(0); + + Array = builder.Build(); + } + public void Visit(FixedSizeListType type) { var builder = new FixedSizeListArray.Builder(type.ValueField, type.ListSize).Reserve(Length); @@ -411,6 +475,64 @@ public void Visit(DictionaryType type) Array = new DictionaryArray(type, indicesBuilder.Build(), valueBuilder.Build()); } + public void Visit(BinaryType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var builder = new BinaryArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(shortData); + break; + case 2: + builder.Append(longData); + break; + } + } + + Array = builder.Build(); + } + + public void Visit(BinaryViewType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var builder = new BinaryViewArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(shortData); + break; + case 2: + builder.Append(longData); + break; + } + } + + Array = builder.Build(); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 2bbc843836af9..230ec5b3effff 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1932,13 +1932,12 @@ def _temp_path(): .skip_tester('Rust'), generate_binary_view_case() - .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), generate_list_view_case() - .skip_tester('C#') + .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), diff --git a/docs/source/status.rst 
b/docs/source/status.rst index e860aceb76e15..03a87012342c2 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -68,9 +68,13 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Binary View | ✓ | | ✓ | | | | | | +| Binary View | ✓ | | ✓ | | ✓ | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| String View | ✓ | | ✓ | | | | | | +| Large Binary View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Utf8 View | ✓ | | ✓ | | ✓ | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Large Utf8 View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -83,7 +87,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| List View | ✓ | | ✓ | | | | | | +| List View | ✓ | | ✓ | | ✓ | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ From bcaeaa8c2d970b81249cfba019475598e3d3109f Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Wed, 27 Dec 2023 11:30:16 -0800 Subject: [PATCH 109/570] MINOR: [C#] Remove launchSettings.json (#39382) ### Rationale for this change A previous commit accidentally included a version of launchSettings.json used for local debugging. This file is not helpful to anyone. ### Are these changes tested? N/A ### Are there any user-facing changes? No. Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../Properties/launchSettings.json | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json b/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json deleted file mode 100644 index 46bdeff290e17..0000000000000 --- a/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "profiles": { - "Apache.Arrow.IntegrationTest": { - "commandName": "Project", - "commandLineArgs": "--mode validate -j C:\\Users\\curt\\AppData\\Local\\Temp\\arrow-integration-9_cov7dz\\generated_binary_view.json -a C:\\Users\\curt\\AppData\\Local\\Temp\\tmpxicbzqpn\\460a151e_generated_binary_view.json_as_file" - } - } -} \ No newline at end of file From 7c3480e2f028f5881242f227f42155cf833efee7 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 29 Dec 2023 10:58:12 +0800 Subject: [PATCH 110/570] GH-39326: [C++] Flaky DatasetWriterTestFixture.MaxRowsOneWriteBackpresure test (#39379) ### Rationale for this change This patch reduce the number of open files in testing first. I've verify the test in 14.0.2, it hangs forever. ### What changes are included in this PR? Change the test file number from 100 to 20 ### Are these changes tested? Already ### Are there any user-facing changes? 
no * Closes: #39326 Authored-by: mwish Signed-off-by: mwish --- cpp/src/arrow/dataset/dataset_writer_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index e62e779f71797..1ac0ec3f39e97 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -290,12 +290,12 @@ TEST_F(DatasetWriterTestFixture, MaxRowsOneWriteBackpresure) { write_options_.max_open_files = 2; write_options_.min_rows_per_group = kFileSizeLimit - 1; auto dataset_writer = MakeDatasetWriter(/*max_rows=*/kFileSizeLimit); - for (int i = 0; i < 20; ++i) { - dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 5), ""); + for (int i = 0; i < 5; ++i) { + dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 2), ""); } EndWriterChecked(dataset_writer.get()); std::vector expected_files; - for (int i = 0; i < 100; ++i) { + for (int i = 0; i < 10; ++i) { expected_files.emplace_back("testdir/chunk-" + std::to_string(i) + ".arrow", kFileSizeLimit * i, kFileSizeLimit); } From 8a9f877896644ef1629136e8428a2c21bce64ae3 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Mon, 1 Jan 2024 22:35:58 +0900 Subject: [PATCH 111/570] GH-39051: [C++] Use Cast() instead of CastTo() for List Scalar in test (#39353) ### Rationale for this change Remove legacy code ### What changes are included in this PR? Replace the legacy scalar CastTo implementation for List Scalar in test. ### Are these changes tested? Yes. It is passed by existing test cases. ### Are there any user-facing changes? No. * Closes: #39051 Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- .../compute/kernels/scalar_cast_nested.cc | 10 ++++- cpp/src/arrow/scalar_test.cc | 39 ++++++++++++------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 6fd449a931381..ec5291ef608a3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -401,7 +401,7 @@ void AddTypeToTypeCast(CastFunction* func) { kernel.exec = CastFunctor::Exec; kernel.signature = KernelSignature::Make({InputType(SrcT::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); + DCHECK_OK(func->AddKernel(SrcT::type_id, std::move(kernel))); } template @@ -480,14 +480,18 @@ std::vector> GetNestedCasts() { auto cast_list = std::make_shared("cast_list", Type::LIST); AddCommonCasts(Type::LIST, kOutputTargetType, cast_list.get()); AddListCast(cast_list.get()); + AddListCast(cast_list.get()); AddListCast(cast_list.get()); + AddListCast(cast_list.get()); AddTypeToTypeCast, FixedSizeListType>(cast_list.get()); auto cast_large_list = std::make_shared("cast_large_list", Type::LARGE_LIST); AddCommonCasts(Type::LARGE_LIST, kOutputTargetType, cast_large_list.get()); AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); AddTypeToTypeCast, FixedSizeListType>( cast_large_list.get()); @@ -503,7 +507,11 @@ std::vector> GetNestedCasts() { AddCommonCasts(Type::FIXED_SIZE_LIST, kOutputTargetType, cast_fsl.get()); AddTypeToTypeCast(cast_fsl.get()); AddTypeToTypeCast, ListType>(cast_fsl.get()); + AddTypeToTypeCast, ListViewType>(cast_fsl.get()); AddTypeToTypeCast, 
LargeListType>(cast_fsl.get()); + AddTypeToTypeCast, LargeListViewType>( + cast_fsl.get()); + AddTypeToTypeCast, MapType>(cast_fsl.get()); // So is struct auto cast_struct = std::make_shared("cast_struct", Type::STRUCT); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index ac740f92c8527..e8b8784e7a314 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1077,7 +1077,8 @@ std::shared_ptr MakeListType( template void CheckListCast(const ScalarType& scalar, const std::shared_ptr& to_type) { - EXPECT_OK_AND_ASSIGN(auto cast_scalar, scalar.CastTo(to_type)); + EXPECT_OK_AND_ASSIGN(auto cast_scalar_datum, Cast(scalar, to_type)); + const auto& cast_scalar = cast_scalar_datum.scalar(); ASSERT_OK(cast_scalar->ValidateFull()); ASSERT_EQ(*cast_scalar->type, *to_type); @@ -1087,11 +1088,25 @@ void CheckListCast(const ScalarType& scalar, const std::shared_ptr& to *checked_cast(*cast_scalar).value); } -void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& to_type, - const std::string& expected_message) { - EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(StatusCode::Invalid, - ::testing::HasSubstr(expected_message), - scalar.CastTo(to_type)); +template +void CheckListCastError(const ScalarType& scalar, + const std::shared_ptr& to_type) { + StatusCode code; + std::string expected_message; + if (scalar.type->id() == Type::FIXED_SIZE_LIST) { + code = StatusCode::TypeError; + expected_message = + "Size of FixedSizeList is not the same. input list: " + scalar.type->ToString() + + " output list: " + to_type->ToString(); + } else { + code = StatusCode::Invalid; + expected_message = + "ListType can only be casted to FixedSizeListType if the lists are all the " + "expected size."; + } + + EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(code, ::testing::HasSubstr(expected_message), + Cast(scalar, to_type)); } template @@ -1178,10 +1193,8 @@ class TestListLikeScalar : public ::testing::Test { CheckListCast( scalar, fixed_size_list(value_->type(), static_cast(value_->length()))); - CheckInvalidListCast(scalar, fixed_size_list(value_->type(), 5), - "Cannot cast " + scalar.type->ToString() + " of length " + - std::to_string(value_->length()) + - " to fixed size list of length 5"); + auto invalid_cast_type = fixed_size_list(value_->type(), 5); + CheckListCastError(scalar, invalid_cast_type); } protected: @@ -1238,10 +1251,8 @@ TEST(TestMapScalar, Cast) { CheckListCast(scalar, large_list(key_value_type)); CheckListCast(scalar, fixed_size_list(key_value_type, 2)); - CheckInvalidListCast(scalar, fixed_size_list(key_value_type, 5), - "Cannot cast " + scalar.type->ToString() + " of length " + - std::to_string(value->length()) + - " to fixed size list of length 5"); + auto invalid_cast_type = fixed_size_list(key_value_type, 5); + CheckListCastError(scalar, invalid_cast_type); } TEST(TestStructScalar, FieldAccess) { From 13696304089217c7c1c9b84c497318f506eee67b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 1 Jan 2024 22:36:37 +0900 Subject: [PATCH 112/570] GH-39359: [CI][C++] Remove MinGW MINGW32 C++ job (#39376) ### Rationale for this change MSYS2 stopped providing MINGW32 packages: * https://github.com/msys2/MINGW-packages/pull/19517 * https://github.com/msys2/MINGW-packages/commit/f68162d5827fce41e7c2d4eb65cab6fcd8b9dd60 ### What changes are included in this PR? Remove the job. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* Closes: #39359 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 2e3c2a355a884..3d4fb10b10c39 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -340,8 +340,6 @@ jobs: fail-fast: false matrix: include: - - msystem_lower: mingw32 - msystem_upper: MINGW32 - msystem_lower: mingw64 msystem_upper: MINGW64 - msystem_lower: clang64 From 4543f5d8394e221681c362f4e7c8a7268823b2cd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 1 Jan 2024 22:38:24 +0900 Subject: [PATCH 113/570] GH-39268: [C++] Don't install bundled Azure SDK for C++ with CMake 3.28+ (#39269) ### Rationale for this change We can implement this by specifying `EXCLUDE_FROM_ALL TRUE` to `fetchcontent_declare()`. ### What changes are included in this PR? Specify `EXCLUDE_FROM_ALL TRUE` only with CMake 3.28+. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #39268 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 89d046945e5fe..3f327ed64ff00 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1015,6 +1015,10 @@ else() endif() include(FetchContent) +set(FC_DECLARE_COMMON_OPTIONS) +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28) + list(APPEND FC_DECLARE_COMMON_OPTIONS EXCLUDE_FROM_ALL TRUE) +endif() macro(prepare_fetchcontent) set(BUILD_SHARED_LIBS OFF) @@ -2146,6 +2150,9 @@ function(build_gtest) message(STATUS "Building gtest from source") set(GTEST_VENDORED TRUE) fetchcontent_declare(googletest + # We should not specify "EXCLUDE_FROM_ALL TRUE" here. + # Because we install GTest with custom path. + # ${FC_DECLARE_COMMON_OPTIONS} URL ${GTEST_SOURCE_URL} URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() @@ -5096,8 +5103,7 @@ function(build_azure_sdk) endif() message(STATUS "Building Azure SDK for C++ from source") fetchcontent_declare(azure_sdk - # EXCLUDE_FROM_ALL is available since CMake 3.28 - # EXCLUDE_FROM_ALL TRUE + ${FC_DECLARE_COMMON_OPTIONS} URL ${ARROW_AZURE_SDK_URL} URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() From 3087c941699ea8485de619b8a36d98322fe20aa0 Mon Sep 17 00:00:00 2001 From: shibei Date: Tue, 2 Jan 2024 09:23:56 +0800 Subject: [PATCH 114/570] GH-39387: [C++] Fix compile warning (#39389) ### Rationale for this change Fix compile warning: ```bash In file included from /workspace/arrow/cpp/src/arrow/array/array_base.h:26: /workspace/arrow/cpp/src/arrow/array/data.h:452:19: warning: unused variable 'buffer_length' [-Wunused-variable] const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); ^ /workspace/arrow/cpp/src/arrow/array/data.h:467:19: warning: unused variable 'buffer_length' [-Wunused-variable] const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); ^ 2 warnings generated. ``` ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? 
* Closes: #39387 Authored-by: shibei Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/array/data.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index f29f164d19973..edd443adc43c4 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -451,6 +451,7 @@ struct ARROW_EXPORT ArraySpan { util::span GetSpan(int i, int64_t length) const { const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); return util::span(buffers[i].data_as() + this->offset, length); } @@ -466,6 +467,7 @@ struct ARROW_EXPORT ArraySpan { util::span GetSpan(int i, int64_t length) { const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); return util::span(buffers[i].mutable_data_as() + this->offset, length); } From 98f677af3c281680b95093ceeab084b3e57e180a Mon Sep 17 00:00:00 2001 From: Hattonuri <53221537+Hattonuri@users.noreply.github.com> Date: Tue, 2 Jan 2024 07:35:48 +0300 Subject: [PATCH 115/570] GH-39413: [C++][Parquet] Vectorize decode plain on FLBA (#39414) ### Rationale for this change ### What changes are included in this PR? FLBA Decode Plain is not vectorized. So this parsing can be implemented faster https://godbolt.org/z/xWeb93xjW ### Are these changes tested? Yes, on unittest ### Are there any user-facing changes? * Closes: #39413 Authored-by: Dmitry Stasenko Signed-off-by: mwish --- cpp/src/parquet/encoding.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9ad1ee6efc12a..840efa12cc3c1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1080,9 +1080,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size ParquetException::EofException(); } for (int i = 0; i < num_values; ++i) { - out[i].ptr = data; - data += type_length; - data_size -= type_length; + out[i].ptr = data + i * type_length; } return static_cast(bytes_to_decode); } From fc20cd002817d62158cfa4cf4e096f29c3fce5da Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 2 Jan 2024 16:07:09 +0800 Subject: [PATCH 116/570] MINOR: [Docs] update date in NOTICE.txt (#39418) ### Rationale for this change Update Date from 2019 to 2024 in `NOTICE.txt` ### What changes are included in this PR? Update Date from 2019 to 2024 in `NOTICE.txt` ### Are these changes tested? no ### Are there any user-facing changes? no Authored-by: mwish Signed-off-by: Sutou Kouhei --- NOTICE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE.txt b/NOTICE.txt index a609791374c28..2089c6fb20358 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ Apache Arrow -Copyright 2016-2019 The Apache Software Foundation +Copyright 2016-2024 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). From eef2f76ec0f80d3bad7f54c4690465eb3df011f3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 10:19:36 -0500 Subject: [PATCH 117/570] MINOR: Bump org.apache.avro:avro from 1.8.2 to 1.11.3 in /java/dataset (#39401) Bumps org.apache.avro:avro from 1.8.2 to 1.11.3. 
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/dataset/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index b533a1733521b..7d6092743bf4d 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -27,7 +27,7 @@ ../../../cpp/release-build/ 2.5.0 1.11.0 - 1.8.2 + 1.11.3 From 6b32b6d5ad5c4a519111086277f231b654c96056 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Tue, 2 Jan 2024 10:20:25 -0500 Subject: [PATCH 118/570] GH-39327: [Java] define assemble descriptor for new custom maven plugin project (#39331) ### Rationale for this change To closes https://github.com/apache/arrow/issues/39327 ### What changes are included in this PR? GitHub CI validation needs to [run](https://github.com/apache/arrow/blob/main/ci/scripts/java_full_build.sh#L52) `assembly:single` for that reason is needed to setup a descriptor ref. In the case of this maven plugin, I only propose to include "src" as part of the resources. ### Are these changes tested? Yes, by ```` mvn clean \ install \ assembly:single \ source:jar \ javadoc:jar \ -Papache-release \ -DdescriptorId=source-release ```` ### Are there any user-facing changes? No. * Closes: #39327 Lead-authored-by: david dali susanibar arce Co-authored-by: Sutou Kouhei Signed-off-by: David Li --- java/maven/pom.xml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 86ac402732bc4..0923984c8e5e5 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -281,6 +281,27 @@
    + + + org.apache.maven.plugins + maven-assembly-plugin + + + package + + single + + + + + + src + + + From 2f63ab9daf9236e8634e12126add0373688adc80 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 10:47:46 -0500 Subject: [PATCH 119/570] MINOR: [Java] Bump com.google.guava:guava-bom from 32.1.3-jre to 33.0.0-jre in /java (#39411) Bumps [com.google.guava:guava-bom](https://github.com/google/guava) from 32.1.3-jre to 33.0.0-jre.
    Release notes

    Sourced from com.google.guava:guava-bom's releases.

    33.0.0

    Maven

    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>33.0.0-jre</version>
      <!-- or, for Android: -->
      <version>33.0.0-android</version>
    </dependency>
    

    Changelog

    • This version of guava-android contains some package-private methods whose signature includes the Java 8 Collector API. This is a test to identify any problems before we expose those methods publicly to users. Please report any problems that you encounter. (73dbf7ef26)
    • Changed various classes to catch Exception instead of RuntimeException even when only RuntimeException is theoretically possible. This can help code that throws undeclared exceptions, as some bytecode rewriters (e.g., Robolectric) and languages (e.g., Kotlin) do. (c294c23760, 747924e, b2baf48)
    • Added an Automatic-Module-Name to failureaccess, Guava's one strong runtime dependency. (280b5d2f60)
    • reflect: In guava-android only, removed Invokable.getAnnotatedReturnType() and Parameter.getAnnotatedType(). These methods never worked in an Android VM, and to reflect that, they were born @ Deprecated, @ Beta, and @ DoNotCall. They're now preventing us from rolling out some new Android compatibility testing. This is the only binary-incompatible change in this release, and it should have no effect in practice. Still, we bump the major version number to follow Semantic Versioning. (045cd8428f)
    • util.concurrent: Changed our implementations to avoid eagerly initializing loggers during class loading. This can help performance, especially under Android. (4fe1df56bd)
    Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 523e5642720cd..522ee4abc7669 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -32,7 +32,7 @@ 1.9.0 5.10.1 2.0.9 - 32.1.3-jre + 33.0.0-jre 4.1.104.Final 1.60.0 3.23.1 From 984eb3838e853a6a862678fb3faed907cd3d05eb Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 2 Jan 2024 12:14:14 -0800 Subject: [PATCH 120/570] GH-39430: [C++][ORC] Upgrade ORC to 1.9.2 (#39431) ### Rationale for this change This PR aims to bring the latest bug fixes - https://orc.apache.org/news/2023/11/10/ORC-1.9.2/ - [ORC-1525 Fix bad read in RleDecoderV2::readByte](https://issues.apache.org/jira/browse/ORC-1525) - https://orc.apache.org/news/2023/08/16/ORC-1.9.1/ - [ORC-1462 Bump aircompressor to 0.25 to fix JDK-8081450](https://issues.apache.org/jira/browse/ORC-1462) ### What changes are included in this PR? This PR upgrades ORC dependency from 1.9.0 to 1.9.2. ### Are these changes tested? Pass the CIs. ### Are there any user-facing changes? No. * Closes: #39430 Authored-by: Dongjoon Hyun Signed-off-by: David Li --- cpp/thirdparty/versions.txt | 4 ++-- java/adapter/orc/pom.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 01cb836ea2a86..e9df0c8d7566b 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -90,8 +90,8 @@ ARROW_OPENTELEMETRY_BUILD_VERSION=v1.8.1 ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM=3d640201594b07f08dade9cd1017bd0b59674daca26223b560b9bb6bf56264c2 ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION=v0.17.0 ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM=f269fbcb30e17b03caa1decd231ce826e59d7651c0f71c3b28eb5140b4bb5412 -ARROW_ORC_BUILD_VERSION=1.9.0 -ARROW_ORC_BUILD_SHA256_CHECKSUM=0dca8bbccdb2ee87e59ba964933436beebd02ea78c4134424828a8127fbc4faa +ARROW_ORC_BUILD_VERSION=1.9.2 +ARROW_ORC_BUILD_SHA256_CHECKSUM=7f46f2c184ecefd6791f1a53fb062286818bd8710c3f08b94dd3cac365e240ee ARROW_PROTOBUF_BUILD_VERSION=v21.3 ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0128ce4c946b8c78c8c49f # Because of https://github.com/Tencent/rapidjson/pull/1323, we require diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index 803ae5a33826f..a42a458e2072a 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -34,7 +34,7 @@ org.apache.orc orc-core - 1.9.0 + 1.9.2 test From 3d1324e86231fbf6799ba5ea22604072857776b1 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Jan 2024 10:53:00 +0200 Subject: [PATCH 121/570] GH-39255: [JS] Allow customization of schema when passing vectors to table constructor (#39256) Merge after #39254. 
* Closes: #39255 --- js/src/builder/largebinary.ts | 2 +- js/src/table.ts | 6 ++++-- js/test/unit/table-tests.ts | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/js/src/builder/largebinary.ts b/js/src/builder/largebinary.ts index 59aa7144d20a1..f737349ac1c49 100644 --- a/js/src/builder/largebinary.ts +++ b/js/src/builder/largebinary.ts @@ -24,7 +24,7 @@ import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; export class LargeBinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/table.ts b/js/src/table.ts index 58518257b30cb..00f4a4cfe0a14 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -73,6 +73,8 @@ export class Table { constructor(...batches: readonly RecordBatch[]); constructor(...columns: { [P in keyof T]: Vector }[]); constructor(...columns: { [P in keyof T]: Data | DataProps }[]); + constructor(schema: Schema, ...columns: { [P in keyof T]: Vector }[]); + constructor(schema: Schema, ...columns: { [P in keyof T]: Data | DataProps }[]); constructor(schema: Schema, data?: RecordBatch | RecordBatch[]); constructor(schema: Schema, data?: RecordBatch | RecordBatch[], offsets?: Uint32Array); constructor(...args: any[]) { @@ -112,8 +114,8 @@ export class Table { } else if (typeof x === 'object') { const keys = Object.keys(x) as (keyof T)[]; const vecs = keys.map((k) => new Vector([x[k]])); - const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullCount > 0))); - const [, batches] = distributeVectorsIntoRecordBatches(schema, vecs); + const batchSchema = schema ?? new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullCount > 0))); + const [, batches] = distributeVectorsIntoRecordBatches(batchSchema, vecs); return batches.length === 0 ? [new RecordBatch(x)] : batches; } } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 6b34124abcaba..094988c052b6e 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -151,6 +151,23 @@ describe(`Table`, () => { expect(i32).toEqualVector(makeVector(i32s)); }); + test(`creates a new Table from a Typed Array and force nullable`, () => { + const i32s = new Int32Array(arange(new Array(10))); + const i32 = makeVector([i32s]); + expect(i32).toHaveLength(i32s.length); + expect(i32.nullCount).toBe(0); + + const table = new Table(new Schema([new Field('i32', new Int32, true)]), { i32 }); + const i32Field = table.schema.fields[0]; + + expect(i32Field.name).toBe('i32'); + expect(i32).toHaveLength(i32s.length); + expect(i32Field.nullable).toBe(true); + expect(i32.nullCount).toBe(0); + + expect(i32).toEqualVector(makeVector(i32s)); + }); + test(`creates a new Table from Typed Arrays`, () => { const i32s = new Int32Array(arange(new Array(10))); const f32s = new Float32Array(arange(new Array(10))); From d75269f9ee85f5dea736192fdef9f831cb518879 Mon Sep 17 00:00:00 2001 From: John Date: Wed, 3 Jan 2024 17:35:41 +0800 Subject: [PATCH 122/570] MINOR: [Docs] Add an empty line to make `.. code-block::` work correctly (#39388) ### Rationale for this change Code block [here](https://arrow.apache.org/docs/developers/java/development.html#unit-testing) didn't work correctly. Added a empty line to make it work well. ### What changes are included in this PR? 
Added a empty line to make it work correctly. ### Are these changes tested? No. ### Are there any user-facing changes? No. Authored-by: John Signed-off-by: AlenkaF --- docs/source/developers/java/development.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/developers/java/development.rst b/docs/source/developers/java/development.rst index f7b19d73da2e2..261cd5702ae07 100644 --- a/docs/source/developers/java/development.rst +++ b/docs/source/developers/java/development.rst @@ -42,6 +42,7 @@ Unit Testing Unit tests are run by Maven during the build. To speed up the build, you can skip them by passing -DskipTests. + .. code-block:: $ cd arrow/java From fe38d0e1ee16662e66784f715c2e8179855ee803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 3 Jan 2024 11:34:53 +0100 Subject: [PATCH 123/570] GH-39425: [CI] Fix import to match new substrait repo structure (#39426) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Upstream substrait repo did a small refactor. We have to update our imports to match the new structure. ### What changes are included in this PR? Update import ### Are these changes tested? Via archery ### Are there any user-facing changes? No * Closes: #39425 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- ci/scripts/integration_substrait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/integration_substrait.sh b/ci/scripts/integration_substrait.sh index f7208ae113814..164f0e80b9890 100755 --- a/ci/scripts/integration_substrait.sh +++ b/ci/scripts/integration_substrait.sh @@ -24,7 +24,7 @@ set -e echo "Substrait Integration Tests" echo "Validating imports" python -c "import pyarrow.substrait" -python -c "from substrait_consumer.consumers import AceroConsumer" +python -c "from substrait_consumer.consumers.acero_consumer import AceroConsumer" echo "Executing pytest" cd consumer-testing From 213cadbbc080399b372291f93aaaa05fe0e67de1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 3 Jan 2024 11:29:15 -0500 Subject: [PATCH 124/570] GH-38458: [Go] Add ValueLen to BinaryLike interface (#39242) ### Rationale for this change Adding `ValueLen` to the `BinaryLike` interface for easy convenience of determining the length of an individual value for a Binary/String like array. ### Are these changes tested? yes * Closes: #38458 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/array/binary.go | 9 +++++++++ go/arrow/array/string.go | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index c226297da04c6..9e26de7a6d820 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -30,6 +30,7 @@ import ( type BinaryLike interface { arrow.Array + ValueLen(int) int ValueBytes() []byte ValueOffset64(int) int64 } @@ -367,6 +368,11 @@ func (a *BinaryView) Value(i int) []byte { return buf.Bytes()[start : start+int32(s.Len())] } +func (a *BinaryView) ValueLen(i int) int { + s := a.ValueHeader(i) + return s.Len() +} + // ValueString returns the value at index i as a string instead of // a byte slice, without copying the underlying data. 
func (a *BinaryView) ValueString(i int) string { @@ -441,4 +447,7 @@ var ( _ arrow.Array = (*Binary)(nil) _ arrow.Array = (*LargeBinary)(nil) _ arrow.Array = (*BinaryView)(nil) + + _ BinaryLike = (*Binary)(nil) + _ BinaryLike = (*LargeBinary)(nil) ) diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 90a4628f0d0fb..c8517ba3056df 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -31,6 +31,7 @@ import ( type StringLike interface { arrow.Array Value(int) string + ValueLen(int) int } // String represents an immutable sequence of variable-length UTF-8 strings. @@ -225,6 +226,14 @@ func (a *LargeString) ValueOffset64(i int) int64 { return a.ValueOffset(i) } +func (a *LargeString) ValueLen(i int) int { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + beg := a.array.data.offset + i + return int(a.offsets[beg+1] - a.offsets[beg]) +} + func (a *LargeString) ValueOffsets() []int64 { beg := a.array.data.offset end := beg + a.array.data.length + 1 @@ -364,6 +373,11 @@ func (a *StringView) Value(i int) string { return *(*string)(unsafe.Pointer(&value)) } +func (a *StringView) ValueLen(i int) int { + s := a.ValueHeader(i) + return s.Len() +} + func (a *StringView) String() string { var o strings.Builder o.WriteString("[") @@ -698,4 +712,7 @@ var ( _ StringLikeBuilder = (*StringBuilder)(nil) _ StringLikeBuilder = (*LargeStringBuilder)(nil) _ StringLikeBuilder = (*StringViewBuilder)(nil) + _ StringLike = (*String)(nil) + _ StringLike = (*LargeString)(nil) + _ StringLike = (*StringView)(nil) ) From 0e597ab1ac62f12a4cf020994b2097643fdb9657 Mon Sep 17 00:00:00 2001 From: LucasG0 <44552904+LucasG0@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:12:24 +0100 Subject: [PATCH 125/570] GH-34316: [Python] FixedSizeListArray.from_arrays supports mask parameter (#39396) ### What changes are included in this PR? Add `mask` / `null_bitmap` parameters in corresponding Cython / C++ `FixedSizeListArray` methods, and propagate this bitmap instead of using the current dummy `validity_buf`. ### Are these changes tested? Yes ### Are there any user-facing changes? 
Yes, `mask` parameter has been added to `FixedSizeListArray.from_arrays` * Closes: #34316 Authored-by: LucasG0 Signed-off-by: Will Jones --- cpp/src/arrow/array/array_nested.cc | 16 ++++++++-------- cpp/src/arrow/array/array_nested.h | 16 ++++++++++++---- python/pyarrow/array.pxi | 13 +++++++++---- python/pyarrow/includes/libarrow.pxd | 8 ++++++-- python/pyarrow/tests/test_array.py | 10 ++++++++++ 5 files changed, 45 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index acdd0a0742468..0b0e340a67d4e 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -894,7 +894,8 @@ const std::shared_ptr& FixedSizeListArray::value_type() const { const std::shared_ptr& FixedSizeListArray::values() const { return values_; } Result> FixedSizeListArray::FromArrays( - const std::shared_ptr& values, int32_t list_size) { + const std::shared_ptr& values, int32_t list_size, + std::shared_ptr null_bitmap, int64_t null_count) { if (list_size <= 0) { return Status::Invalid("list_size needs to be a strict positive integer"); } @@ -905,14 +906,14 @@ Result> FixedSizeListArray::FromArrays( } int64_t length = values->length() / list_size; auto list_type = std::make_shared(values->type(), list_size); - std::shared_ptr validity_buf; - return std::make_shared(list_type, length, values, validity_buf, - /*null_count=*/0, /*offset=*/0); + return std::make_shared(list_type, length, values, null_bitmap, + null_count); } Result> FixedSizeListArray::FromArrays( - const std::shared_ptr& values, std::shared_ptr type) { + const std::shared_ptr& values, std::shared_ptr type, + std::shared_ptr null_bitmap, int64_t null_count) { if (type->id() != Type::FIXED_SIZE_LIST) { return Status::TypeError("Expected fixed size list type, got ", type->ToString()); } @@ -926,10 +927,9 @@ Result> FixedSizeListArray::FromArrays( "The length of the values Array needs to be a multiple of the list size"); } int64_t length = values->length() / list_type.list_size(); - std::shared_ptr validity_buf; - return std::make_shared(type, length, values, validity_buf, - /*null_count=*/0, /*offset=*/0); + return std::make_shared(type, length, values, null_bitmap, + null_count); } Result> FixedSizeListArray::Flatten( diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 61606e1592d61..768a630e0af54 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -599,17 +599,25 @@ class ARROW_EXPORT FixedSizeListArray : public Array { /// /// \param[in] values Array containing list values /// \param[in] list_size The fixed length of each list + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap /// \return Will have length equal to values.length() / list_size - static Result> FromArrays(const std::shared_ptr& values, - int32_t list_size); + static Result> FromArrays( + const std::shared_ptr& values, int32_t list_size, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); /// \brief Construct FixedSizeListArray from child value array and type /// /// \param[in] values Array containing list values /// \param[in] type The fixed sized list type + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap /// \return Will have length equal to values.length() / type.list_size() - static Result> FromArrays(const std::shared_ptr& values, - std::shared_ptr 
type); + static Result> FromArrays( + const std::shared_ptr& values, std::shared_ptr type, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); protected: void SetData(const std::shared_ptr& data); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 74a196002bfa6..751dfbcce4342 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2484,7 +2484,7 @@ cdef class MapArray(ListArray): Examples -------- - First, let's understand the structure of our dataset when viewed in a rectangular data model. + First, let's understand the structure of our dataset when viewed in a rectangular data model. The total of 5 respondents answered the question "How much did you like the movie x?". The value -1 in the integer array means that the value is missing. The boolean array represents the null bitmask corresponding to the missing values in the integer array. @@ -2590,7 +2590,7 @@ cdef class FixedSizeListArray(BaseListArray): """ @staticmethod - def from_arrays(values, list_size=None, DataType type=None): + def from_arrays(values, list_size=None, DataType type=None, mask=None): """ Construct FixedSizeListArray from array of values and a list length. @@ -2602,6 +2602,9 @@ cdef class FixedSizeListArray(BaseListArray): type : DataType, optional If not specified, a default ListType with the values' type and `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + Returns ------- @@ -2652,19 +2655,21 @@ cdef class FixedSizeListArray(BaseListArray): _values = asarray(values) + c_mask = c_mask_inverted_from_obj(mask, None) + if type is not None: if list_size is not None: raise ValueError("Cannot specify both list_size and type") with nogil: c_result = CFixedSizeListArray.FromArraysAndType( - _values.sp_array, type.sp_type) + _values.sp_array, type.sp_type, c_mask) else: if list_size is None: raise ValueError("Should specify one of list_size and type") _list_size = list_size with nogil: c_result = CFixedSizeListArray.FromArrays( - _values.sp_array, _list_size) + _values.sp_array, _list_size, c_mask) cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) result.validate() return result diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index bad5ec606c268..82b888f584813 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -673,11 +673,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeListArray" arrow::FixedSizeListArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( - const shared_ptr[CArray]& values, int32_t list_size) + const shared_ptr[CArray]& values, + int32_t list_size, + shared_ptr[CBuffer] null_bitmap) @staticmethod CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( - const shared_ptr[CArray]& values, shared_ptr[CDataType]) + const shared_ptr[CArray]& values, + shared_ptr[CDataType], + shared_ptr[CBuffer] null_bitmap) int64_t value_offset(int i) int64_t value_length(int i) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 599d15d023a55..d598630dc2103 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1091,6 +1091,16 @@ def test_fixed_size_list_from_arrays(): assert result.type.equals(typ) assert result.type.value_field.name == "name" + result = pa.FixedSizeListArray.from_arrays(values, + type=typ, + mask=pa.array([False, True, False])) + 
assert result.to_pylist() == [[0, 1, 2, 3], None, [8, 9, 10, 11]] + + result = pa.FixedSizeListArray.from_arrays(values, + list_size=4, + mask=pa.array([False, True, False])) + assert result.to_pylist() == [[0, 1, 2, 3], None, [8, 9, 10, 11]] + # raise on invalid values / list_size with pytest.raises(ValueError): pa.FixedSizeListArray.from_arrays(values, -4) From 5c0fa712faec0b2997b5970890c076011f96de77 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Thu, 4 Jan 2024 03:12:04 +0200 Subject: [PATCH 126/570] GH-39435: [JS] Add Vector.nullable (#39436) --- js/src/table.ts | 2 +- js/src/util/chunk.ts | 5 +++++ js/src/vector.ts | 8 ++++++++ js/test/unit/table-tests.ts | 18 ++++++++++-------- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/js/src/table.ts b/js/src/table.ts index 00f4a4cfe0a14..e719b7ca9d313 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -114,7 +114,7 @@ export class Table { } else if (typeof x === 'object') { const keys = Object.keys(x) as (keyof T)[]; const vecs = keys.map((k) => new Vector([x[k]])); - const batchSchema = schema ?? new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullCount > 0))); + const batchSchema = schema ?? new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullable))); const [, batches] = distributeVectorsIntoRecordBatches(batchSchema, vecs); return batches.length === 0 ? [new RecordBatch(x)] : batches; } diff --git a/js/src/util/chunk.ts b/js/src/util/chunk.ts index 6098b04243422..36620627f197d 100644 --- a/js/src/util/chunk.ts +++ b/js/src/util/chunk.ts @@ -51,6 +51,11 @@ export class ChunkedIterator implements IterableIterator(chunks: ReadonlyArray>) { + return chunks.some(chunk => chunk.nullable); +} + /** @ignore */ export function computeChunkNullCounts(chunks: ReadonlyArray>) { return chunks.reduce((nullCount, chunk) => nullCount + chunk.nullCount, 0); diff --git a/js/src/vector.ts b/js/src/vector.ts index 7e1caa343562c..8b94b14e3fff7 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -24,6 +24,7 @@ import { BigIntArray, TypedArray, TypedArrayDataType } from './interfaces.js'; import { isChunkedValid, computeChunkOffsets, + computeChunkNullable, computeChunkNullCounts, sliceChunks, wrapChunkedCall1, @@ -132,6 +133,13 @@ export class Vector { return this.data.reduce((byteLength, data) => byteLength + data.byteLength, 0); } + /** + * Whether this Vector's elements can contain null values. + */ + public get nullable() { + return computeChunkNullable(this.data); + } + /** * The number of null elements in this Vector. 
*/ diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 094988c052b6e..ffda47f473368 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -139,30 +139,32 @@ describe(`Table`, () => { const i32 = makeVector([i32s]); expect(i32).toHaveLength(i32s.length); expect(i32.nullCount).toBe(0); + expect(i32.nullable).toBe(true); const table = new Table({ i32 }); const i32Field = table.schema.fields[0]; expect(i32Field.name).toBe('i32'); expect(i32).toHaveLength(i32s.length); - expect(i32Field.nullable).toBe(false); + expect(i32Field.nullable).toBe(true); expect(i32.nullCount).toBe(0); expect(i32).toEqualVector(makeVector(i32s)); }); - test(`creates a new Table from a Typed Array and force nullable`, () => { + test(`creates a new Table from a Typed Array and force not nullable`, () => { const i32s = new Int32Array(arange(new Array(10))); const i32 = makeVector([i32s]); expect(i32).toHaveLength(i32s.length); expect(i32.nullCount).toBe(0); + expect(i32.nullable).toBe(true); - const table = new Table(new Schema([new Field('i32', new Int32, true)]), { i32 }); + const table = new Table(new Schema([new Field('i32', new Int32, false)]), { i32 }); const i32Field = table.schema.fields[0]; expect(i32Field.name).toBe('i32'); expect(i32).toHaveLength(i32s.length); - expect(i32Field.nullable).toBe(true); + expect(i32Field.nullable).toBe(false); expect(i32.nullCount).toBe(0); expect(i32).toEqualVector(makeVector(i32s)); @@ -187,8 +189,8 @@ describe(`Table`, () => { expect(f32Field.name).toBe('f32'); expect(i32).toHaveLength(i32s.length); expect(f32).toHaveLength(f32s.length); - expect(i32Field.nullable).toBe(false); - expect(f32Field.nullable).toBe(false); + expect(i32Field.nullable).toBe(true); + expect(f32Field.nullable).toBe(true); expect(i32.nullCount).toBe(0); expect(f32.nullCount).toBe(0); @@ -222,7 +224,7 @@ describe(`Table`, () => { expect(i32Vector).toHaveLength(i32s.length); expect(f32Vector).toHaveLength(i32s.length); // new length should be the same as the longest sibling - expect(i32Field.nullable).toBe(false); + expect(i32Field.nullable).toBe(true); expect(f32Field.nullable).toBe(true); // true, with 12 additional nulls expect(i32Vector.nullCount).toBe(0); expect(f32Vector.nullCount).toBe(i32s.length - f32s.length); @@ -264,7 +266,7 @@ describe(`Table`, () => { expect(f32RenamedField.name).toBe('f32Renamed'); expect(i32Renamed).toHaveLength(i32s.length); expect(f32Renamed).toHaveLength(i32s.length); // new length should be the same as the longest sibling - expect(i32RenamedField.nullable).toBe(false); + expect(i32RenamedField.nullable).toBe(true); expect(f32RenamedField.nullable).toBe(true); // true, with 4 additional nulls expect(i32Renamed.nullCount).toBe(0); expect(f32Renamed.nullCount).toBe(i32s.length - f32s.length); From 27d72f3a773ddbb8dd5ee679b9ed6b555a2bb8ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 4 Jan 2024 11:49:04 +0100 Subject: [PATCH 127/570] GH-39421: [CI][Ruby] Update to using Ubuntu 22.04 on test-ruby and test-c-glib nightly jobs (#39422) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change CI Jobs for Ruby and c-glib are failing on Ubuntu due to bundler failing to install on old Ruby. ### What changes are included in this PR? Use Ubuntu 22.04 on those jobs. ### Are these changes tested? Via Archery ### Are there any user-facing changes? 
No * Closes: #39421 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- dev/tasks/tasks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ed6ea08894f10..04faef427e281 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1032,6 +1032,8 @@ tasks: ci: github template: docker-tests/github.linux.yml params: + env: + UBUNTU: 22.04 image: {{ image }} {% endfor %} From ccc674c56f3473c9556a5af96dff9d156f559663 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Thu, 4 Jan 2024 12:57:25 -0500 Subject: [PATCH 128/570] GH-38964: [C++] Fix spelling (compute) (#38965) ### Rationale for this change ### What changes are included in this PR? Spelling fixes to cpp/src/arrow/compute/ ### Are these changes tested? ### Are there any user-facing changes? * Closes: #38964 Authored-by: Josh Soref <2119212+jsoref@users.noreply.github.com> Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/api_aggregate.h | 2 +- cpp/src/arrow/compute/api_scalar.h | 4 +-- cpp/src/arrow/compute/api_vector.h | 6 ++-- cpp/src/arrow/compute/exec.cc | 2 +- cpp/src/arrow/compute/exec_internal.h | 2 +- cpp/src/arrow/compute/exec_test.cc | 2 +- .../arrow/compute/kernels/aggregate_basic.cc | 2 +- .../kernels/aggregate_basic_internal.h | 2 +- .../arrow/compute/kernels/aggregate_mode.cc | 2 +- .../compute/kernels/aggregate_quantile.cc | 2 +- .../arrow/compute/kernels/aggregate_test.cc | 4 +-- .../arrow/compute/kernels/hash_aggregate.cc | 4 +-- .../kernels/scalar_arithmetic_benchmark.cc | 2 +- .../compute/kernels/scalar_arithmetic_test.cc | 2 +- .../arrow/compute/kernels/scalar_cast_test.cc | 8 ++--- .../compute/kernels/scalar_if_else_test.cc | 2 +- cpp/src/arrow/compute/kernels/scalar_round.cc | 2 +- .../compute/kernels/scalar_string_internal.h | 2 +- .../compute/kernels/scalar_string_test.cc | 4 +-- .../compute/kernels/scalar_temporal_test.cc | 14 ++++---- .../compute/kernels/vector_run_end_encode.cc | 12 +++---- .../arrow/compute/kernels/vector_select_k.cc | 32 +++++++++---------- .../compute/kernels/vector_selection_test.cc | 2 +- cpp/src/arrow/compute/key_map.cc | 4 +-- cpp/src/arrow/compute/key_map.h | 4 +-- cpp/src/arrow/compute/key_map_avx2.cc | 2 +- cpp/src/arrow/compute/light_array.cc | 2 +- cpp/src/arrow/compute/light_array_test.cc | 2 +- cpp/src/arrow/compute/ordering.h | 2 +- cpp/src/arrow/compute/registry_test.cc | 2 +- cpp/src/arrow/compute/row/grouper.cc | 2 +- cpp/src/arrow/compute/row/grouper.h | 10 +++--- 32 files changed, 73 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 4d2c814a69bbb..2e5210b073ee4 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -452,7 +452,7 @@ Result TDigest(const Datum& value, /// \brief Find the first index of a value in an array. /// /// \param[in] value The array to search. -/// \param[in] options The array to search for. See IndexOoptions. +/// \param[in] options The array to search for. See IndexOptions. /// \param[in] ctx the function execution context, optional /// \return out a Scalar containing the index (or -1 if not found). 
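/// A minimal usage sketch (illustrative only; the input array and the
/// searched-for value below are assumed, not part of this change):
///
///   std::shared_ptr<arrow::Array> haystack = ...;  // e.g. an int64 array
///   arrow::compute::IndexOptions opts(arrow::MakeScalar(int64_t{42}));
///   ARROW_ASSIGN_OR_RAISE(arrow::Datum found,
///                         arrow::compute::Index(haystack, opts));
///   // `found` wraps an Int64Scalar holding the first index of 42, or -1
///   // if the value is absent.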
/// diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 26fbe64f74293..bad34f4a37881 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -491,7 +491,7 @@ struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions { /// How to interpret ambiguous local times (due to DST shifts) Ambiguous ambiguous; - /// How to interpret non-existent local times (due to DST shifts) + /// How to interpret nonexistent local times (due to DST shifts) Nonexistent nonexistent; }; @@ -1589,7 +1589,7 @@ ARROW_EXPORT Result MonthsBetween(const Datum& left, const Datum& right, ARROW_EXPORT Result WeeksBetween(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); -/// \brief Month Day Nano Between finds the number of months, days, and nonaseconds +/// \brief Month Day Nano Between finds the number of months, days, and nanoseconds /// between two values /// /// \param[in] left input treated as the start time diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 759f9e5c1a408..919572f16ee69 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -401,7 +401,7 @@ Result> NthToIndices(const Array& values, int64_t n, /// \brief Return indices that partition an array around n-th sorted element. /// -/// This overload takes a PartitionNthOptions specifiying the pivot index +/// This overload takes a PartitionNthOptions specifying the pivot index /// and the null handling. /// /// \param[in] values array to be partitioned @@ -452,7 +452,7 @@ Result> SortIndices(const Array& array, /// \brief Return the indices that would sort an array. /// -/// This overload takes a ArraySortOptions specifiying the sort order +/// This overload takes a ArraySortOptions specifying the sort order /// and the null handling. /// /// \param[in] array array to sort @@ -486,7 +486,7 @@ Result> SortIndices(const ChunkedArray& chunked_array, /// \brief Return the indices that would sort a chunked array. /// -/// This overload takes a ArraySortOptions specifiying the sort order +/// This overload takes a ArraySortOptions specifying the sort order /// and the null handling. /// /// \param[in] chunked_array chunked array to sort diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index c18dfa0952245..28dcf493fa294 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -1164,7 +1164,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { // TODO(wesm): this is odd and should be examined soon -- only one state // "should" be needed per thread of execution - // FIXME(ARROW-11840) don't merge *any* aggegates for every batch + // FIXME(ARROW-11840) don't merge *any* aggregates for every batch ARROW_ASSIGN_OR_RAISE(auto batch_state, kernel_->init(kernel_ctx_, {kernel_, *input_types_, options_})); diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index 8beff2a6c63ac..7e4f364a9288e 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -46,7 +46,7 @@ class ARROW_EXPORT ExecSpanIterator { public: ExecSpanIterator() = default; - /// \brief Initialize itertor iterator and do basic argument validation + /// \brief Initialize iterator and do basic argument validation /// /// \param[in] batch the input ExecBatch /// \param[in] max_chunksize the maximum length of each ExecSpan. 
Depending diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index d661e5735fea6..cfce0c57fa416 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -1232,7 +1232,7 @@ void TestCallScalarFunctionPreallocationCases::DoTest(FunctionCallerMaker caller } // Set the exec_chunksize to be smaller, so now we have several invocations - // of the kernel, but still the output is onee array + // of the kernel, but still the output is one array { std::vector args = {Datum(arr)}; exec_ctx_->set_exec_chunksize(80); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index ddd241652460e..1fbcd6a249093 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -1100,7 +1100,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { AddFirstLastKernels(FirstLastInit, TemporalTypes(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - // Add first/last as convience functions + // Add first/last as convenience functions func = std::make_shared("first", Arity::Unary(), first_doc, &default_scalar_aggregate_options); AddFirstOrLastAggKernel(func.get(), first_last_func); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 4966e9871d62c..f08e7aaa538bb 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -472,7 +472,7 @@ struct FirstLastImpl : public ScalarAggregator { this->count += arr.length() - null_count; if (null_count == 0) { - // If there are no null valus, we can just merge + // If there are no null values, we can just merge // the first and last element this->state.MergeOne(arr.GetView(0)); this->state.MergeOne(arr.GetView(arr.length() - 1)); diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 7f359ead6cb83..3f84c0a5ee4c4 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -115,7 +115,7 @@ Status Finalize(KernelContext* ctx, const DataType& type, ExecResult* out, return Status::OK(); } -// count value occurances for integers with narrow value range +// count value occurrences for integers with narrow value range // O(1) space, O(n) time template struct CountModer { diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index e675a1cec86c9..f4826229dd46c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -120,7 +120,7 @@ struct SortQuantiler { }); // input array is partitioned around data point at `last_index` (pivot) - // for next quatile which is smaller, we only consider inputs left of the pivot + // for next quantile which is smaller, we only consider inputs left of the pivot uint64_t last_index = in_buffer.size(); if (is_datapoint) { CType* out_buffer = out_data->template GetMutableValues(1); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index aa19fb3401232..65439af2748b5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -478,7 +478,7 @@ TEST_F(TestSumKernelRoundOff, Basics) { // array = np.arange(321000, dtype='float64') // 
array -= np.mean(array) - // array *= arrray + // array *= array double index = 0; ASSERT_OK_AND_ASSIGN( auto array, ArrayFromBuilderVisitor( @@ -3653,7 +3653,7 @@ class TestPrimitiveQuantileKernel : public ::testing::Test { #define INTYPE(x) Datum(static_cast(x)) #define DOUBLE(x) Datum(static_cast(x)) -// output type per interplation: linear, lower, higher, nearest, midpoint +// output type per interpolation: linear, lower, higher, nearest, midpoint #define O(a, b, c, d, e) \ { DOUBLE(a), INTYPE(b), INTYPE(c), INTYPE(d), DOUBLE(e) } diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 47cae538e2e3f..c37e45513d040 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1848,8 +1848,8 @@ struct GroupedFirstLastImpl final : public GroupedAggregator { const ArrayData& group_id_mapping) override { // The merge is asymmetric. "first" from this state gets pick over "first" from other // state. "last" from other state gets pick over from this state. This is so that when - // using with segmeneted aggregation, we still get the correct "first" and "last" - // value for the entire segement. + // using with segmented aggregation, we still get the correct "first" and "last" + // value for the entire segment. auto other = checked_cast(&raw_other); auto raw_firsts = firsts_.mutable_data(); diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc index 4b678da5f1b42..17e9951d69bc2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc @@ -33,7 +33,7 @@ constexpr auto kSeed = 0x94378165; using BinaryOp = Result(const Datum&, const Datum&, ArithmeticOptions, ExecContext*); -// Add explicit overflow-checked shortcuts, for easy benchmark parametering. +// Add explicit overflow-checked shortcuts, for easy benchmark parameterizing. 
static Result AddChecked(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR) { diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index 756b3028c4a59..37a1bcbc02d73 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -1857,7 +1857,7 @@ TEST_F(TestBinaryArithmeticDecimal, DispatchBest) { } } -// reference result from bc (precsion=100, scale=40) +// reference result from bc (precision=100, scale=40) TEST_F(TestBinaryArithmeticDecimal, AddSubtract) { // array array, decimal128 { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index b429c8175b020..a8acf68f66c8b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2842,19 +2842,19 @@ TEST(Cast, StructToDifferentNullabilityStruct) { ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), Cast(src_nullable, options1_non_nullable)); - std::vector> fields_dest2_non_nullble = { + std::vector> fields_dest2_non_nullable = { std::make_shared("a", int64(), false), std::make_shared("c", int64(), false)}; - const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullble); + const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullable); const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), Cast(src_nullable, options2_non_nullable)); - std::vector> fields_dest3_non_nullble = { + std::vector> fields_dest3_non_nullable = { std::make_shared("c", int64(), false)}; - const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullble); + const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullable); const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 771261cac9140..c4c46b5efe84d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -69,7 +69,7 @@ template class TestIfElsePrimitive : public ::testing::Test {}; // There are a lot of tests here if we cover all the types and it gets slow on valgrind -// so we overrdie the standard type sets with a smaller range +// so we override the standard type sets with a smaller range #ifdef ARROW_VALGRIND using IfElseNumericBasedTypes = ::testing::Types::Round(round_val); } - // Equality check is ommitted so that the common case of 10^0 (integer rounding) + // Equality check is omitted so that the common case of 10^0 (integer rounding) // uses multiply-only round_val = ndigits > 0 ? 
(round_val / pow10) : (round_val * pow10); if (!std::isfinite(round_val)) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_internal.h b/cpp/src/arrow/compute/kernels/scalar_string_internal.h index 1a9969441655d..7a5d5a7c86e85 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_string_internal.h @@ -306,7 +306,7 @@ struct StringSplitExec { using ListOffsetsBuilderType = TypedBufferBuilder; using State = OptionsWrapper; - // Keep the temporary storage accross individual values, to minimize reallocations + // Keep the temporary storage across individual values, to minimize reallocations std::vector parts; Options options; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index ff14f5e7a5c5d..5dec16d89e29c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -2060,7 +2060,7 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) { this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"([""])", &options_edgecase_1); - // this is a safeguard agains an optimization path possible, but actually a tricky case + // this is a safeguard against an optimization path possible, but actually a tricky case SliceOptions options_edgecase_2{-6, -2}; this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"(["𝑓öõ"])", &options_edgecase_2); @@ -2189,7 +2189,7 @@ TYPED_TEST(TestBinaryKernels, SliceBytesBasic) { "ds\"]", this->type(), R"([""])", &options_edgecase_1); - // this is a safeguard agains an optimization path possible, but actually a tricky case + // this is a safeguard against an optimization path possible, but actually a tricky case SliceOptions options_edgecase_2{-6, -2}; this->CheckUnary("binary_slice", "[\"f\xc2\xa2" diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index d8bbe5ca8a34c..d4482334285bc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -2101,9 +2101,9 @@ TEST_F(ScalarTemporalTest, StrftimeNoTimezone) { TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) { const char* seconds = R"(["1970-01-01T00:00:59", null])"; - auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "non-existent"), seconds); + auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "nonexistent"), seconds); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Cannot locate timezone 'non-existent'"), + Invalid, testing::HasSubstr("Cannot locate timezone 'nonexistent'"), Strftime(arr, StrftimeOptions())); } @@ -2159,12 +2159,12 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { } TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { - auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "non-existent"); + auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "nonexistent"); const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - testing::HasSubstr("Cannot find locale 'non-existent'"), + testing::HasSubstr("Cannot find locale 'nonexistent'"), Strftime(arr, options)); } @@ -2601,7 +2601,7 @@ TEST_F(ScalarTemporalTestStrictCeil, TestCeilTemporalStrictCeil) { TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, CeilUTC) { std::string op = "ceil_temporal"; - // Data for tests below was generaed via lubridate with the 
exception + // Data for tests below was generated via lubridate with the exception // of week data because lubridate currently does not support rounding to // multiple of week. const char* ceil_15_nanosecond = @@ -2989,7 +2989,7 @@ TEST_F(ScalarTemporalTest, TestFloorTemporal) { TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, FloorUTC) { std::string op = "floor_temporal"; - // Data for tests below was generaed via lubridate with the exception + // Data for tests below was generated via lubridate with the exception // of week data because lubridate currently does not support rounding to // multiple of week. const char* floor_15_nanosecond = @@ -3402,7 +3402,7 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalBrussels) { TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, RoundUTC) { std::string op = "round_temporal"; - // Data for tests below was generaed via lubridate with the exception + // Data for tests below was generated via lubridate with the exception // of week data because lubridate currently does not support rounding to // multiple of week. const char* round_15_nanosecond = diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc index 943fdcd6b147f..811ed23e1134b 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc @@ -30,11 +30,11 @@ namespace compute { namespace internal { namespace { -struct RunEndEncondingState : public KernelState { - explicit RunEndEncondingState(std::shared_ptr run_end_type) +struct RunEndEncodingState : public KernelState { + explicit RunEndEncodingState(std::shared_ptr run_end_type) : run_end_type{std::move(run_end_type)} {} - ~RunEndEncondingState() override = default; + ~RunEndEncodingState() override = default; std::shared_ptr run_end_type; }; @@ -273,7 +273,7 @@ struct RunEndEncodeExec { template static Status Exec(KernelContext* ctx, const ExecSpan& span, ExecResult* result) { - auto state = checked_cast(ctx->state()); + auto state = checked_cast(ctx->state()); switch (state->run_end_type->id()) { case Type::INT16: return DoExec(ctx, span, result); @@ -290,7 +290,7 @@ struct RunEndEncodeExec { /// \brief The OutputType::Resolver of the "run_end_decode" function. static Result ResolveOutputType( KernelContext* ctx, const std::vector& input_types) { - auto state = checked_cast(ctx->state()); + auto state = checked_cast(ctx->state()); return TypeHolder(std::make_shared(state->run_end_type, input_types[0].GetSharedPtr())); } @@ -301,7 +301,7 @@ Result> RunEndEncodeInit(KernelContext*, auto* options = checked_cast(args.options); auto run_end_type = options ? 
options->run_end_type : RunEndEncodeOptions::Defaults().run_end_type; - return std::make_unique(std::move(run_end_type)); + return std::make_unique(std::move(run_end_type)); } template diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index 1740a9b7f0bb4..97996e6d52cc0 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -72,9 +72,9 @@ class SelectKComparator { } }; -class ArraySelecter : public TypeVisitor { +class ArraySelector : public TypeVisitor { public: - ArraySelecter(ExecContext* ctx, const Array& array, const SelectKOptions& options, + ArraySelector(ExecContext* ctx, const Array& array, const SelectKOptions& options, Datum* output) : TypeVisitor(), ctx_(ctx), @@ -164,9 +164,9 @@ struct TypedHeapItem { ArrayType* array; }; -class ChunkedArraySelecter : public TypeVisitor { +class ChunkedArraySelector : public TypeVisitor { public: - ChunkedArraySelecter(ExecContext* ctx, const ChunkedArray& chunked_array, + ChunkedArraySelector(ExecContext* ctx, const ChunkedArray& chunked_array, const SelectKOptions& options, Datum* output) : TypeVisitor(), chunked_array_(chunked_array), @@ -273,13 +273,13 @@ class ChunkedArraySelecter : public TypeVisitor { Datum* output_; }; -class RecordBatchSelecter : public TypeVisitor { +class RecordBatchSelector : public TypeVisitor { private: using ResolvedSortKey = ResolvedRecordBatchSortKey; using Comparator = MultipleKeyComparator; public: - RecordBatchSelecter(ExecContext* ctx, const RecordBatch& record_batch, + RecordBatchSelector(ExecContext* ctx, const RecordBatch& record_batch, const SelectKOptions& options, Datum* output) : TypeVisitor(), ctx_(ctx), @@ -391,7 +391,7 @@ class RecordBatchSelecter : public TypeVisitor { Comparator comparator_; }; -class TableSelecter : public TypeVisitor { +class TableSelector : public TypeVisitor { private: struct ResolvedSortKey { ResolvedSortKey(const std::shared_ptr& chunked_array, @@ -420,7 +420,7 @@ class TableSelecter : public TypeVisitor { using Comparator = MultipleKeyComparator; public: - TableSelecter(ExecContext* ctx, const Table& table, const SelectKOptions& options, + TableSelector(ExecContext* ctx, const Table& table, const SelectKOptions& options, Datum* output) : TypeVisitor(), ctx_(ctx), @@ -610,32 +610,32 @@ class SelectKUnstableMetaFunction : public MetaFunction { Result SelectKth(const Array& array, const SelectKOptions& options, ExecContext* ctx) const { Datum output; - ArraySelecter selecter(ctx, array, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + ArraySelector selector(ctx, array, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } Result SelectKth(const ChunkedArray& chunked_array, const SelectKOptions& options, ExecContext* ctx) const { Datum output; - ChunkedArraySelecter selecter(ctx, chunked_array, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + ChunkedArraySelector selector(ctx, chunked_array, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } Result SelectKth(const RecordBatch& record_batch, const SelectKOptions& options, ExecContext* ctx) const { ARROW_RETURN_NOT_OK(CheckConsistency(*record_batch.schema(), options.sort_keys)); Datum output; - RecordBatchSelecter selecter(ctx, record_batch, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + RecordBatchSelector selector(ctx, record_batch, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } Result 
SelectKth(const Table& table, const SelectKOptions& options, ExecContext* ctx) const { ARROW_RETURN_NOT_OK(CheckConsistency(*table.schema(), options.sort_keys)); Datum output; - TableSelecter selecter(ctx, table, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + TableSelector selector(ctx, table, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } }; diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index 30e85c1f71089..bdf9f5454fdef 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -2488,7 +2488,7 @@ TEST(TestIndicesNonZero, IndicesNonZeroBoolean) { Datum actual; std::shared_ptr result; - // boool + // bool ASSERT_OK_AND_ASSIGN( actual, CallFunction("indices_nonzero", {ArrayFromJSON(boolean(), "[null, true, false, true]")})); diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index 525dae850f19b..a027ec811cf24 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -505,7 +505,7 @@ void SwissTable::find(const int num_keys, const uint32_t* hashes, // Slow processing of input keys in the most generic case. // Handles inserting new keys. -// Pre-existing keys will be handled correctly, although the intended use is for this +// Preexisting keys will be handled correctly, although the intended use is for this // call to follow a call to find() method, which would only pass on new keys that were // not present in the hash table. // @@ -617,7 +617,7 @@ Status SwissTable::map_new_keys(uint32_t num_ids, uint16_t* ids, const uint32_t* ARROW_DCHECK(static_cast(num_ids) <= (1 << log_minibatch_)); ARROW_DCHECK(static_cast(max_id + 1) <= (1 << log_minibatch_)); - // Allocate temporary buffers for slot ids and intialize them + // Allocate temporary buffers for slot ids and initialize them auto slot_ids_buf = util::TempVectorHolder(temp_stack, max_id + 1); uint32_t* slot_ids = slot_ids_buf.mutable_data(); init_slot_ids_for_new_keys(num_ids, ids, hashes, slot_ids); diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index 85ef9029d6fc9..8e06dc83483aa 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -142,7 +142,7 @@ class ARROW_EXPORT SwissTable { void extract_group_ids_imp(const int num_keys, const uint16_t* selection, const uint32_t* hashes, const uint8_t* local_slots, uint32_t* out_group_ids, int elements_offset, - int element_mutltiplier) const; + int element_multiplier) const; inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found) const; @@ -187,7 +187,7 @@ class ARROW_EXPORT SwissTable { // Slow processing of input keys in the most generic case. // Handles inserting new keys. - // Pre-existing keys will be handled correctly, although the intended use is for this + // Preexisting keys will be handled correctly, although the intended use is for this // call to follow a call to find() method, which would only pass on new keys that were // not present in the hash table. 
// diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc index 731553511044f..3526a6cb0f344 100644 --- a/cpp/src/arrow/compute/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_avx2.cc @@ -117,7 +117,7 @@ int SwissTable::early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* h vlocal_slot = _mm256_add_epi32(_mm256_and_si256(vlocal_slot, _mm256_set1_epi32(0xff)), _mm256_and_si256(vgt, _mm256_set1_epi32(4))); - // Convert slot id relative to the block to slot id relative to the beginnning of the + // Convert slot id relative to the block to slot id relative to the beginning of the // table // uint64_t local_slot = _mm256_extract_epi64( diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 93a054de1957c..73ea01a03a8fa 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -89,7 +89,7 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const { sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8; if (metadata_.fixed_length == 0 && !metadata_.is_null_type) { - ARROW_DCHECK(is_bool_type()) << "Expected BOOL type type but got a different type."; + ARROW_DCHECK(is_bool_type()) << "Expected BOOL type but got a different type."; sliced.buffers_[1] = buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr; sliced.mutable_buffers_[1] = mutable_buffers_[1] diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 52121530fe91d..3ceba43604b28 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -333,7 +333,7 @@ TEST(ResizableArrayData, Binary) { ASSERT_EQ(0, array.num_rows()); ASSERT_OK(array.ResizeFixedLengthBuffers(2)); ASSERT_EQ(2, array.num_rows()); - // At this point the offets memory has been allocated and needs to be filled + // At this point the offsets memory has been allocated and needs to be filled // in before we allocate the variable length memory int offsets_width = static_cast(arrow::internal::checked_pointer_cast(type) diff --git a/cpp/src/arrow/compute/ordering.h b/cpp/src/arrow/compute/ordering.h index e581269cc20dd..61caa2b570dd3 100644 --- a/cpp/src/arrow/compute/ordering.h +++ b/cpp/src/arrow/compute/ordering.h @@ -52,7 +52,7 @@ class ARROW_EXPORT SortKey : public util::EqualityComparable { bool Equals(const SortKey& other) const; std::string ToString() const; - /// A FieldRef targetting the sort column. + /// A FieldRef targeting the sort column. FieldRef target; /// How to order by this sort key. 
SortOrder order; diff --git a/cpp/src/arrow/compute/registry_test.cc b/cpp/src/arrow/compute/registry_test.cc index 2d69f119df1f4..3dc14bcff83ee 100644 --- a/cpp/src/arrow/compute/registry_test.cc +++ b/cpp/src/arrow/compute/registry_test.cc @@ -69,7 +69,7 @@ TEST_P(TestRegistry, Basics) { ASSERT_OK_AND_ASSIGN(std::shared_ptr f1, registry_->GetFunction("f1")); ASSERT_EQ("f1", f1->name()); - // Non-existent function + // Nonexistent function ASSERT_RAISES(KeyError, registry_->GetFunction("f2")); // Try adding a function with name collision diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index b3d28ef19a1a0..5e23eda16fda2 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -210,7 +210,7 @@ struct SimpleKeySegmenter : public BaseRowSegmenter { private: TypeHolder key_type_; - std::vector save_key_data_; // previusly seen segment-key grouping data + std::vector save_key_data_; // previously seen segment-key grouping data bool extend_was_called_; }; diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 15f00eaac2191..628a9c14f3e44 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -29,12 +29,12 @@ namespace arrow { namespace compute { /// \brief A segment -/// A segment group is a chunk of continous rows that have the same segment key. (For +/// A segment group is a chunk of continuous rows that have the same segment key. (For /// example, in ordered time series processing, segment key can be "date", and a segment /// group can be all the rows that belong to the same date.) A segment group can span -/// across multiple exec batches. A segment is a chunk of continous rows that has the same -/// segment key within a given batch. When a segment group span cross batches, it will -/// have multiple segments. A segment never spans cross batches. The segment data +/// across multiple exec batches. A segment is a chunk of continuous rows that has the +/// same segment key within a given batch. When a segment group span cross batches, it +/// will have multiple segments. A segment never spans cross batches. The segment data /// structure only makes sense when used along with a exec batch. struct ARROW_EXPORT Segment { /// \brief the offset into the batch where the segment starts @@ -92,7 +92,7 @@ class ARROW_EXPORT RowSegmenter { /// \brief Reset this segmenter /// /// A segmenter normally extends (see `Segment`) a segment from one batch to the next. - /// If segment-extenion is undesirable, for example when each batch is processed + /// If segment-extension is undesirable, for example when each batch is processed /// independently, then `Reset` should be invoked before processing the next batch. virtual Status Reset() = 0; From 83cba25017a5c3a03e47f1851f242fa284f93533 Mon Sep 17 00:00:00 2001 From: Yue Date: Fri, 5 Jan 2024 03:02:40 +0800 Subject: [PATCH 129/570] GH-37848: [C++][Gandiva] Migrate LLVM JIT engine from MCJIT to ORC v2/LLJIT (#39098) ### Rationale for this change Gandiva currently employs MCJIT as its internal JIT engine. However, LLVM has introduced a newer JIT API known as ORC v2/LLJIT since LLVM 7.0, and it has several advantage over MCJIT, in particular, MCJIT is not actively maintained, and is slated for eventual deprecation and removal. ### What changes are included in this PR? * This PR replaces the MCJIT JIT engine with the ORC v2 engine, using the `LLJIT` API. 
* This PR adds a new JIT linker option `JITLink` (https://llvm.org/docs/JITLink.html), which can be used together with `LLJIT`, for LLVM 14+ on Linux/macOS platform. It is turned off by default but could be turned on with environment variable `GANDIVA_USE_JIT_LINK` ### Are these changes tested? Yes, they are covered by existing unit tests ### Are there any user-facing changes? * `Configuration` class has a new option called `dump_ir`. If users would like to call `DumpIR` API of `Projector` and `Filter`, they have to set the `dump_ir` option first. * Closes: #37848 Authored-by: Yue Ni Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/FindLLVMAlt.cmake | 2 +- cpp/src/gandiva/configuration.h | 17 +- cpp/src/gandiva/engine.cc | 357 ++++++++++++++-------- cpp/src/gandiva/engine.h | 46 ++- cpp/src/gandiva/engine_llvm_test.cc | 26 +- cpp/src/gandiva/filter.cc | 8 +- cpp/src/gandiva/filter.h | 2 +- cpp/src/gandiva/llvm_generator.cc | 23 +- cpp/src/gandiva/llvm_generator.h | 12 +- cpp/src/gandiva/llvm_generator_test.cc | 21 +- cpp/src/gandiva/projector.cc | 8 +- cpp/src/gandiva/projector.h | 2 +- cpp/src/gandiva/tests/micro_benchmarks.cc | 31 ++ cpp/src/gandiva/tests/test_util.cc | 4 + cpp/src/gandiva/tests/test_util.h | 2 + python/pyarrow/gandiva.pyx | 59 +++- python/pyarrow/includes/libgandiva.pxd | 14 +- python/pyarrow/tests/test_gandiva.py | 6 +- 18 files changed, 441 insertions(+), 199 deletions(-) diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake index 69f680824b082..2730f829817f6 100644 --- a/cpp/cmake_modules/FindLLVMAlt.cmake +++ b/cpp/cmake_modules/FindLLVMAlt.cmake @@ -93,8 +93,8 @@ if(LLVM_FOUND) debuginfodwarf ipo linker - mcjit native + orcjit target) if(LLVM_VERSION_MAJOR GREATER_EQUAL 14) list(APPEND LLVM_TARGET_COMPONENTS passes) diff --git a/cpp/src/gandiva/configuration.h b/cpp/src/gandiva/configuration.h index f43a2b190731f..620c58537f963 100644 --- a/cpp/src/gandiva/configuration.h +++ b/cpp/src/gandiva/configuration.h @@ -37,10 +37,12 @@ class GANDIVA_EXPORT Configuration { explicit Configuration(bool optimize, std::shared_ptr function_registry = - gandiva::default_function_registry()) + gandiva::default_function_registry(), + bool dump_ir = false) : optimize_(optimize), target_host_cpu_(true), - function_registry_(function_registry) {} + function_registry_(std::move(function_registry)), + dump_ir_(dump_ir) {} Configuration() : Configuration(true) {} @@ -50,11 +52,13 @@ class GANDIVA_EXPORT Configuration { bool optimize() const { return optimize_; } bool target_host_cpu() const { return target_host_cpu_; } + bool dump_ir() const { return dump_ir_; } std::shared_ptr function_registry() const { return function_registry_; } void set_optimize(bool optimize) { optimize_ = optimize; } + void set_dump_ir(bool dump_ir) { dump_ir_ = dump_ir; } void target_host_cpu(bool target_host_cpu) { target_host_cpu_ = target_host_cpu; } void set_function_registry(std::shared_ptr function_registry) { function_registry_ = std::move(function_registry); @@ -65,6 +69,9 @@ class GANDIVA_EXPORT Configuration { bool target_host_cpu_; /* set the mcpu flag to host cpu while compiling llvm ir */ std::shared_ptr function_registry_; /* function registry that may contain external functions */ + // flag indicating if IR dumping is needed, defaults to false, and turning it on will + // negatively affect performance + bool dump_ir_ = false; }; /// \brief configuration builder for gandiva @@ -83,6 +90,12 @@ class GANDIVA_EXPORT ConfigurationBuilder { return configuration; } + 
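  // A minimal usage sketch (illustrative only; the schema and expression are
  // assumed, not part of this change). With IR dumping enabled, callers can
  // retrieve the generated module IR through the DumpIR API of Projector/Filter:
  //
  //   auto config = ConfigurationBuilder().build_with_ir_dumping(true);
  //   std::shared_ptr<Projector> projector;
  //   ARROW_RETURN_NOT_OK(Projector::Make(schema, {expr}, config, &projector));
  //   auto ir = projector->DumpIR();
  //
  // (Independently of IR dumping, the JITLink-based linker can be opted into
  // on LLVM 14+ via the GANDIVA_USE_JIT_LINK environment variable.)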
std::shared_ptr build_with_ir_dumping(bool dump_ir) { + std::shared_ptr configuration( + new Configuration(true, gandiva::default_function_registry(), dump_ir)); + return configuration; + } + std::shared_ptr build( std::shared_ptr function_registry) { std::shared_ptr configuration( diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 1cea1fd2cbf30..fc047f2ac0763 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -31,7 +31,8 @@ #include #include -#include "arrow/util/logging.h" +#include +#include #if defined(_MSC_VER) #pragma warning(push) @@ -46,13 +47,14 @@ #include #include #include -#include +#include #include #include #include #include #include #include +#include #if LLVM_VERSION_MAJOR >= 17 #include #else @@ -86,6 +88,13 @@ #include #include +// JITLink is available in LLVM 9+ +// but the `InProcessMemoryManager::Create` API was added since LLVM 14 +#if LLVM_VERSION_MAJOR >= 14 && !defined(_WIN32) +#define JIT_LINK_SUPPORTED +#include +#endif + #if defined(_MSC_VER) #pragma warning(pop) #endif @@ -103,9 +112,136 @@ extern const size_t kPrecompiledBitcodeSize; std::once_flag llvm_init_once_flag; static bool llvm_init = false; static llvm::StringRef cpu_name; -static llvm::SmallVector cpu_attrs; +static std::vector cpu_attrs; std::once_flag register_exported_funcs_flag; +template +arrow::Result AsArrowResult(llvm::Expected& expected, + const std::string& error_context) { + if (!expected) { + return Status::CodeGenError(error_context, llvm::toString(expected.takeError())); + } + return std::move(expected.get()); +} + +Result MakeTargetMachineBuilder( + const Configuration& conf) { + llvm::orc::JITTargetMachineBuilder jtmb( + (llvm::Triple(llvm::sys::getDefaultTargetTriple()))); + if (conf.target_host_cpu()) { + jtmb.setCPU(cpu_name.str()); + jtmb.addFeatures(cpu_attrs); + } + auto const opt_level = + conf.optimize() ? 
llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; + jtmb.setCodeGenOptLevel(opt_level); + return jtmb; +} + +std::string DumpModuleIR(const llvm::Module& module) { + std::string ir; + llvm::raw_string_ostream stream(ir); + module.print(stream, nullptr); + return ir; +} + +void AddAbsoluteSymbol(llvm::orc::LLJIT& lljit, const std::string& name, + void* function_ptr) { + llvm::orc::MangleAndInterner mangle(lljit.getExecutionSession(), lljit.getDataLayout()); + + // https://github.com/llvm/llvm-project/commit/8b1771bd9f304be39d4dcbdcccedb6d3bcd18200#diff-77984a824d9182e5c67a481740f3bc5da78d5bd4cf6e1716a083ddb30a4a4931 + // LLVM 17 introduced ExecutorSymbolDef and move most of ORC APIs to ExecutorAddr +#if LLVM_VERSION_MAJOR >= 17 + llvm::orc::ExecutorSymbolDef symbol( + llvm::orc::ExecutorAddr(reinterpret_cast(function_ptr)), + llvm::JITSymbolFlags::Exported); +#else + llvm::JITEvaluatedSymbol symbol(reinterpret_cast(function_ptr), + llvm::JITSymbolFlags::Exported); +#endif + + auto error = lljit.getMainJITDylib().define( + llvm::orc::absoluteSymbols({{mangle(name), symbol}})); + llvm::cantFail(std::move(error)); +} + +// add current process symbol to dylib +// LLVM >= 18 does this automatically +void AddProcessSymbol(llvm::orc::LLJIT& lljit) { + lljit.getMainJITDylib().addGenerator( + llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess( + lljit.getDataLayout().getGlobalPrefix()))); + // the `atexit` symbol cannot be found for ASAN +#ifdef ADDRESS_SANITIZER + if (!lljit.lookup("atexit")) { + AddAbsoluteSymbol(lljit, "atexit", reinterpret_cast(atexit)); + } +#endif +} + +#ifdef JIT_LINK_SUPPORTED +Result> CreateMemmoryManager() { + auto maybe_mem_manager = llvm::jitlink::InProcessMemoryManager::Create(); + return AsArrowResult(maybe_mem_manager, "Could not create memory manager: "); +} + +Status UseJITLinkIfEnabled(llvm::orc::LLJITBuilder& jit_builder) { + static auto maybe_use_jit_link = ::arrow::internal::GetEnvVar("GANDIVA_USE_JIT_LINK"); + if (maybe_use_jit_link.ok()) { + ARROW_ASSIGN_OR_RAISE(static auto memory_manager, CreateMemmoryManager()); + jit_builder.setObjectLinkingLayerCreator( + [&](llvm::orc::ExecutionSession& ES, const llvm::Triple& TT) { + return std::make_unique(ES, *memory_manager); + }); + } + return Status::OK(); +} +#endif + +Result> BuildJIT( + llvm::orc::JITTargetMachineBuilder jtmb, + std::optional>& object_cache) { + llvm::orc::LLJITBuilder jit_builder; + +#ifdef JIT_LINK_SUPPORTED + ARROW_RETURN_NOT_OK(UseJITLinkIfEnabled(jit_builder)); +#endif + + jit_builder.setJITTargetMachineBuilder(std::move(jtmb)); + if (object_cache.has_value()) { + jit_builder.setCompileFunctionCreator( + [&object_cache](llvm::orc::JITTargetMachineBuilder JTMB) + -> llvm::Expected> { + auto target_machine = JTMB.createTargetMachine(); + if (!target_machine) { + return target_machine.takeError(); + } + // after compilation, the object code will be stored into the given object + // cache + return std::make_unique( + std::move(*target_machine), &object_cache.value().get()); + }); + } + auto maybe_jit = jit_builder.create(); + ARROW_ASSIGN_OR_RAISE(auto jit, + AsArrowResult(maybe_jit, "Could not create LLJIT instance: ")); + + AddProcessSymbol(*jit); + return jit; +} + +Status Engine::SetLLVMObjectCache(GandivaObjectCache& object_cache) { + auto cached_buffer = object_cache.getObject(nullptr); + if (cached_buffer) { + auto error = lljit_->addObjectFile(std::move(cached_buffer)); + if (error) { + return Status::CodeGenError("Failed to add cached object file to 
LLJIT: ", + llvm::toString(std::move(error))); + } + } + return Status::OK(); +} + void Engine::InitOnce() { DCHECK_EQ(llvm_init, false); @@ -127,28 +263,34 @@ void Engine::InitOnce() { } } ARROW_LOG(INFO) << "Detected CPU Name : " << cpu_name.str(); - ARROW_LOG(INFO) << "Detected CPU Features:" << cpu_attrs_str; + ARROW_LOG(INFO) << "Detected CPU Features: [" << cpu_attrs_str << "]"; llvm_init = true; } Engine::Engine(const std::shared_ptr& conf, - std::unique_ptr ctx, - std::unique_ptr engine, llvm::Module* module, - bool cached) - : context_(std::move(ctx)), - execution_engine_(std::move(engine)), + std::unique_ptr lljit, + std::unique_ptr target_machine, bool cached) + : context_(std::make_unique()), + lljit_(std::move(lljit)), ir_builder_(std::make_unique>(*context_)), - module_(module), types_(*context_), optimize_(conf->optimize()), cached_(cached), - function_registry_(conf->function_registry()) {} + function_registry_(conf->function_registry()), + target_machine_(std::move(target_machine)), + conf_(conf) { + // LLVM 10 doesn't like the expr function name to be the same as the module name + auto module_id = "gdv_module_" + std::to_string(reinterpret_cast(this)); + module_ = std::make_unique(module_id, *context_); +} + +Engine::~Engine() {} Status Engine::Init() { std::call_once(register_exported_funcs_flag, gandiva::RegisterExportedFuncs); + // Add mappings for global functions that can be accessed from LLVM/IR module. ARROW_RETURN_NOT_OK(AddGlobalMappings()); - return Status::OK(); } @@ -163,101 +305,32 @@ Status Engine::LoadFunctionIRs() { } /// factory method to construct the engine. -Status Engine::Make(const std::shared_ptr& conf, bool cached, - std::unique_ptr* out) { +Result> Engine::Make( + const std::shared_ptr& conf, bool cached, + std::optional> object_cache) { std::call_once(llvm_init_once_flag, InitOnce); - auto ctx = std::make_unique(); - auto module = std::make_unique("codegen", *ctx); - - // Capture before moving, ExecutionEngine does not allow retrieving the - // original Module. - auto module_ptr = module.get(); - - auto opt_level = - conf->optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; - - // Note that the lifetime of the error string is not captured by the - // ExecutionEngine but only for the lifetime of the builder. Found by - // inspecting LLVM sources. 
- std::string builder_error; - - llvm::EngineBuilder engine_builder(std::move(module)); - - engine_builder.setEngineKind(llvm::EngineKind::JIT) - .setOptLevel(opt_level) - .setErrorStr(&builder_error); - - if (conf->target_host_cpu()) { - engine_builder.setMCPU(cpu_name); - engine_builder.setMAttrs(cpu_attrs); - } - std::unique_ptr exec_engine{engine_builder.create()}; - - if (exec_engine == nullptr) { - return Status::CodeGenError("Could not instantiate llvm::ExecutionEngine: ", - builder_error); - } + ARROW_ASSIGN_OR_RAISE(auto jtmb, MakeTargetMachineBuilder(*conf)); + ARROW_ASSIGN_OR_RAISE(auto jit, BuildJIT(jtmb, object_cache)); + auto maybe_tm = jtmb.createTargetMachine(); + ARROW_ASSIGN_OR_RAISE(auto target_machine, + AsArrowResult(maybe_tm, "Could not create target machine: ")); std::unique_ptr engine{ - new Engine(conf, std::move(ctx), std::move(exec_engine), module_ptr, cached)}; - ARROW_RETURN_NOT_OK(engine->Init()); - *out = std::move(engine); - return Status::OK(); -} - -// This method was modified from its original version for a part of MLIR -// Original source from -// https://github.com/llvm/llvm-project/blob/9f2ce5b915a505a5488a5cf91bb0a8efa9ddfff7/mlir/lib/ExecutionEngine/ExecutionEngine.cpp -// The original copyright notice follows. - -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -static void SetDataLayout(llvm::Module* module) { - auto target_triple = llvm::sys::getDefaultTargetTriple(); - std::string error_message; - auto target = llvm::TargetRegistry::lookupTarget(target_triple, error_message); - if (!target) { - return; - } - - std::string cpu(llvm::sys::getHostCPUName()); - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f : host_features) { - features.AddFeature(f.first(), f.second); - } - } + new Engine(conf, std::move(jit), std::move(target_machine), cached)}; - std::unique_ptr machine( - target->createTargetMachine(target_triple, cpu, features.getString(), {}, {})); - - module->setDataLayout(machine->createDataLayout()); -} -// end of the modified method from MLIR - -template -static arrow::Result AsArrowResult(llvm::Expected& expected) { - if (!expected) { - std::string str; - llvm::raw_string_ostream stream(str); - stream << expected.takeError(); - return Status::CodeGenError(stream.str()); - } - return std::move(expected.get()); + ARROW_RETURN_NOT_OK(engine->Init()); + return engine; } static arrow::Status VerifyAndLinkModule( - llvm::Module* dest_module, + llvm::Module& dest_module, llvm::Expected> src_module_or_error) { - ARROW_ASSIGN_OR_RAISE(auto src_ir_module, AsArrowResult(src_module_or_error)); + ARROW_ASSIGN_OR_RAISE( + auto src_ir_module, + AsArrowResult(src_module_or_error, "Failed to verify and link module: ")); - // set dataLayout - SetDataLayout(src_ir_module.get()); + src_ir_module->setDataLayout(dest_module.getDataLayout()); std::string error_info; llvm::raw_string_ostream error_stream(error_info); @@ -265,16 +338,21 @@ static arrow::Status VerifyAndLinkModule( llvm::verifyModule(*src_ir_module, &error_stream), Status::CodeGenError("verify of IR Module failed: " + error_stream.str())); - ARROW_RETURN_IF(llvm::Linker::linkModules(*dest_module, std::move(src_ir_module)), + ARROW_RETURN_IF(llvm::Linker::linkModules(dest_module, std::move(src_ir_module)), Status::CodeGenError("failed to link IR Modules")); 
return Status::OK(); } +llvm::Module* Engine::module() { + DCHECK(!module_finalized_) << "module cannot be accessed after finalized"; + return module_.get(); +} + // Handling for pre-compiled IR libraries. Status Engine::LoadPreCompiledIR() { - auto bitcode = llvm::StringRef(reinterpret_cast(kPrecompiledBitcode), - kPrecompiledBitcodeSize); + auto const bitcode = llvm::StringRef(reinterpret_cast(kPrecompiledBitcode), + kPrecompiledBitcodeSize); /// Read from file into memory buffer. llvm::ErrorOr> buffer_or_error = @@ -291,14 +369,14 @@ Status Engine::LoadPreCompiledIR() { llvm::getOwningLazyBitcodeModule(std::move(buffer), *context()); // NOTE: llvm::handleAllErrors() fails linking with RTTI-disabled LLVM builds // (ARROW-5148) - ARROW_RETURN_NOT_OK(VerifyAndLinkModule(module_, std::move(module_or_error))); + ARROW_RETURN_NOT_OK(VerifyAndLinkModule(*module_, std::move(module_or_error))); return Status::OK(); } static llvm::MemoryBufferRef AsLLVMMemoryBuffer(const arrow::Buffer& arrow_buffer) { - auto data = reinterpret_cast(arrow_buffer.data()); - auto size = arrow_buffer.size(); - return llvm::MemoryBufferRef(llvm::StringRef(data, size), "external_bitcode"); + auto const data = reinterpret_cast(arrow_buffer.data()); + auto const size = arrow_buffer.size(); + return {llvm::StringRef(data, size), "external_bitcode"}; } Status Engine::LoadExternalPreCompiledIR() { @@ -306,7 +384,7 @@ Status Engine::LoadExternalPreCompiledIR() { for (auto const& buffer : buffers) { auto llvm_memory_buffer_ref = AsLLVMMemoryBuffer(*buffer); auto module_or_error = llvm::parseBitcodeFile(llvm_memory_buffer_ref, *context()); - ARROW_RETURN_NOT_OK(VerifyAndLinkModule(module_, std::move(module_or_error))); + ARROW_RETURN_NOT_OK(VerifyAndLinkModule(*module_, std::move(module_or_error))); } return Status::OK(); @@ -386,7 +464,8 @@ static void OptimizeModuleWithLegacyPassManager(llvm::Module& module, std::unique_ptr pass_manager( new llvm::legacy::PassManager()); - pass_manager->add(llvm::createTargetTransformInfoWrapperPass(target_analysis)); + pass_manager->add( + llvm::createTargetTransformInfoWrapperPass(std::move(target_analysis))); pass_manager->add(llvm::createFunctionInliningPass()); pass_manager->add(llvm::createInstructionCombiningPass()); pass_manager->add(llvm::createPromoteMemoryToRegisterPass()); @@ -411,40 +490,64 @@ Status Engine::FinalizeModule() { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); if (optimize_) { - auto target_analysis = execution_engine_->getTargetMachine()->getTargetIRAnalysis(); - + auto target_analysis = target_machine_->getTargetIRAnalysis(); // misc passes to allow for inlining, vectorization, .. 
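    // LLVM 14 and newer take the new pass-manager pipeline below; older LLVM
    // releases fall back to the legacy pass manager.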
#if LLVM_VERSION_MAJOR >= 14 - OptimizeModuleWithNewPassManager(*module_, target_analysis); + OptimizeModuleWithNewPassManager(*module_, std::move(target_analysis)); #else - OptimizeModuleWithLegacyPassManager(*module_, target_analysis); + OptimizeModuleWithLegacyPassManager(*module_, std::move(target_analysis)); #endif } ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), Status::CodeGenError("Module verification failed after optimizer")); - } - // do the compilation - execution_engine_->finalizeObject(); + // print the module IR and save it for later use if IR dumping is needed + // since the module will be moved to construct LLJIT instance, and it is not + // available after LLJIT instance is constructed + if (conf_->dump_ir()) { + module_ir_ = DumpModuleIR(*module_); + } + + llvm::orc::ThreadSafeModule tsm(std::move(module_), std::move(context_)); + auto error = lljit_->addIRModule(std::move(tsm)); + if (error) { + return Status::CodeGenError("Failed to add IR module to LLJIT: ", + llvm::toString(std::move(error))); + } + } module_finalized_ = true; return Status::OK(); } -void* Engine::CompiledFunction(std::string& function) { - DCHECK(module_finalized_); - return reinterpret_cast(execution_engine_->getFunctionAddress(function)); +Result Engine::CompiledFunction(const std::string& function) { + DCHECK(module_finalized_) + << "module must be finalized before getting compiled function"; + auto sym = lljit_->lookup(function); + if (!sym) { + return Status::CodeGenError("Failed to look up function: " + function + + " error: " + llvm::toString(sym.takeError())); + } + // Since LLVM 15, `LLJIT::lookup` returns ExecutorAddrs rather than + // JITEvaluatedSymbols +#if LLVM_VERSION_MAJOR >= 15 + auto fn_addr = sym->getValue(); +#else + auto fn_addr = sym->getAddress(); +#endif + auto fn_ptr = reinterpret_cast(fn_addr); + if (fn_ptr == nullptr) { + return Status::CodeGenError("Failed to get address for function: " + function); + } + return fn_ptr; } void Engine::AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type, - const std::vector& args, - void* function_ptr) { - constexpr bool is_var_arg = false; - auto prototype = llvm::FunctionType::get(ret_type, args, is_var_arg); - constexpr auto linkage = llvm::GlobalValue::ExternalLinkage; - auto fn = llvm::Function::Create(prototype, linkage, name, module()); - execution_engine_->addGlobalMapping(fn, function_ptr); + const std::vector& args, void* func) { + auto const prototype = llvm::FunctionType::get(ret_type, args, /*is_var_arg*/ false); + llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, name, module()); + AddAbsoluteSymbol(*lljit_, name, func); } arrow::Status Engine::AddGlobalMappings() { @@ -453,11 +556,9 @@ arrow::Status Engine::AddGlobalMappings() { return c_funcs.AddMappings(this); } -std::string Engine::DumpIR() { - std::string ir; - llvm::raw_string_ostream stream(ir); - module_->print(stream, nullptr); - return ir; +const std::string& Engine::ir() { + DCHECK(!module_ir_.empty()) << "dump_ir in Configuration must be set for dumping IR"; + return module_ir_; } } // namespace gandiva diff --git a/cpp/src/gandiva/engine.h b/cpp/src/gandiva/engine.h index df2d8b36d9260..565c3f142502d 100644 --- a/cpp/src/gandiva/engine.h +++ b/cpp/src/gandiva/engine.h @@ -17,11 +17,16 @@ #pragma once +#include +#include #include +#include #include #include #include +#include + #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "gandiva/configuration.h" @@ -30,23 +35,34 @@ #include 
"gandiva/llvm_types.h" #include "gandiva/visibility.h" +namespace llvm::orc { +class LLJIT; +} // namespace llvm::orc + namespace gandiva { /// \brief LLVM Execution engine wrapper. class GANDIVA_EXPORT Engine { public: + ~Engine(); llvm::LLVMContext* context() { return context_.get(); } llvm::IRBuilder<>* ir_builder() { return ir_builder_.get(); } LLVMTypes* types() { return &types_; } - llvm::Module* module() { return module_; } + + /// Retrieve LLVM module in the engine. + /// This should only be called before `FinalizeModule` is called + llvm::Module* module(); /// Factory method to create and initialize the engine object. /// /// \param[in] config the engine configuration /// \param[in] cached flag to mark if the module is already compiled and cached - /// \param[out] engine the created engine - static Status Make(const std::shared_ptr& config, bool cached, - std::unique_ptr* engine); + /// \param[in] object_cache an optional object_cache used for building the module + /// \return arrow::Result containing the created engine + static Result> Make( + const std::shared_ptr& config, bool cached, + std::optional> object_cache = + std::nullopt); /// Add the function to the list of IR functions that need to be compiled. /// Compiling only the functions that are used by the module saves time. @@ -59,36 +75,31 @@ class GANDIVA_EXPORT Engine { Status FinalizeModule(); /// Set LLVM ObjectCache. - void SetLLVMObjectCache(GandivaObjectCache& object_cache) { - execution_engine_->setObjectCache(&object_cache); - } + Status SetLLVMObjectCache(GandivaObjectCache& object_cache); /// Get the compiled function corresponding to the irfunction. - void* CompiledFunction(std::string& function); + Result CompiledFunction(const std::string& function); // Create and add a mapping for the cpp function to make it accessible from LLVM. void AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type, const std::vector& args, void* func); /// Return the generated IR for the module. - std::string DumpIR(); + const std::string& ir(); /// Load the function IRs that can be accessed in the module. Status LoadFunctionIRs(); private: Engine(const std::shared_ptr& conf, - std::unique_ptr ctx, - std::unique_ptr engine, llvm::Module* module, - bool cached); + std::unique_ptr lljit, + std::unique_ptr target_machine, bool cached); // Post construction init. This _must_ be called after the constructor. Status Init(); static void InitOnce(); - llvm::ExecutionEngine& execution_engine() { return *execution_engine_; } - /// load pre-compiled IR modules from precompiled_bitcode.cc and merge them into /// the main module. 
Status LoadPreCompiledIR(); @@ -103,9 +114,9 @@ class GANDIVA_EXPORT Engine { Status RemoveUnusedFunctions(); std::unique_ptr context_; - std::unique_ptr execution_engine_; + std::unique_ptr lljit_; std::unique_ptr> ir_builder_; - llvm::Module* module_; + std::unique_ptr module_; LLVMTypes types_; std::vector functions_to_compile_; @@ -115,6 +126,9 @@ class GANDIVA_EXPORT Engine { bool cached_; bool functions_loaded_ = false; std::shared_ptr function_registry_; + std::string module_ir_; + std::unique_ptr target_machine_; + const std::shared_ptr conf_; }; } // namespace gandiva diff --git a/cpp/src/gandiva/engine_llvm_test.cc b/cpp/src/gandiva/engine_llvm_test.cc index 9baaa82d2e0d3..78f468d13fa1f 100644 --- a/cpp/src/gandiva/engine_llvm_test.cc +++ b/cpp/src/gandiva/engine_llvm_test.cc @@ -24,14 +24,14 @@ namespace gandiva { -typedef int64_t (*add_vector_func_t)(int64_t* data, int n); +using add_vector_func_t = int64_t (*)(int64_t*, int); class TestEngine : public ::testing::Test { protected: - std::string BuildVecAdd(Engine* engine) { - auto types = engine->types(); - llvm::IRBuilder<>* builder = engine->ir_builder(); - llvm::LLVMContext* context = engine->context(); + std::string BuildVecAdd(Engine* gdv_engine) { + auto types = gdv_engine->types(); + llvm::IRBuilder<>* builder = gdv_engine->ir_builder(); + llvm::LLVMContext* context = gdv_engine->context(); // Create fn prototype : // int64_t add_longs(int64_t *elements, int32_t nelements) @@ -42,10 +42,10 @@ class TestEngine : public ::testing::Test { llvm::FunctionType::get(types->i64_type(), arguments, false /*isVarArg*/); // Create fn - std::string func_name = "add_longs"; - engine->AddFunctionToCompile(func_name); + std::string func_name = "add_longs_test_expr"; + gdv_engine->AddFunctionToCompile(func_name); llvm::Function* fn = llvm::Function::Create( - prototype, llvm::GlobalValue::ExternalLinkage, func_name, engine->module()); + prototype, llvm::GlobalValue::ExternalLinkage, func_name, gdv_engine->module()); assert(fn != nullptr); // Name the arguments @@ -99,7 +99,9 @@ class TestEngine : public ::testing::Test { return func_name; } - void BuildEngine() { ASSERT_OK(Engine::Make(TestConfiguration(), false, &engine)); } + void BuildEngine() { + ASSERT_OK_AND_ASSIGN(engine, Engine::Make(TestConfiguration(), false)); + } std::unique_ptr engine; std::shared_ptr configuration = TestConfiguration(); @@ -111,7 +113,8 @@ TEST_F(TestEngine, TestAddUnoptimised) { std::string fn_name = BuildVecAdd(engine.get()); ASSERT_OK(engine->FinalizeModule()); - auto add_func = reinterpret_cast(engine->CompiledFunction(fn_name)); + ASSERT_OK_AND_ASSIGN(auto fn_ptr, engine->CompiledFunction(fn_name)); + auto add_func = reinterpret_cast(fn_ptr); int64_t my_array[] = {1, 3, -5, 8, 10}; EXPECT_EQ(add_func(my_array, 5), 17); @@ -123,7 +126,8 @@ TEST_F(TestEngine, TestAddOptimised) { std::string fn_name = BuildVecAdd(engine.get()); ASSERT_OK(engine->FinalizeModule()); - auto add_func = reinterpret_cast(engine->CompiledFunction(fn_name)); + EXPECT_OK_AND_ASSIGN(auto fn_ptr, engine->CompiledFunction(fn_name)); + auto add_func = reinterpret_cast(fn_ptr); int64_t my_array[] = {1, 3, -5, 8, 10}; EXPECT_EQ(add_func(my_array, 5), 17); diff --git a/cpp/src/gandiva/filter.cc b/cpp/src/gandiva/filter.cc index 416d97b5dbd1d..8a270cfdc06f2 100644 --- a/cpp/src/gandiva/filter.cc +++ b/cpp/src/gandiva/filter.cc @@ -65,8 +65,8 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, GandivaObjectCache obj_cache(cache, cache_key); // Build LLVM generator, and 
generate code for the specified expression - std::unique_ptr llvm_gen; - ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); + ARROW_ASSIGN_OR_RAISE(auto llvm_gen, + LLVMGenerator::Make(configuration, is_cached, obj_cache)); if (!is_cached) { // Run the validation on the expression. @@ -77,7 +77,7 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, } // Set the object cache for LLVM - llvm_gen->SetLLVMObjectCache(obj_cache); + ARROW_RETURN_NOT_OK(llvm_gen->SetLLVMObjectCache(obj_cache)); ARROW_RETURN_NOT_OK(llvm_gen->Build({condition}, SelectionVector::Mode::MODE_NONE)); @@ -119,7 +119,7 @@ Status Filter::Evaluate(const arrow::RecordBatch& batch, return out_selection->PopulateFromBitMap(result, bitmap_size, num_rows - 1); } -std::string Filter::DumpIR() { return llvm_generator_->DumpIR(); } +const std::string& Filter::DumpIR() { return llvm_generator_->ir(); } void Filter::SetBuiltFromCache(bool flag) { built_from_cache_ = flag; } diff --git a/cpp/src/gandiva/filter.h b/cpp/src/gandiva/filter.h index cc536bca1bb3d..b4043d93c857a 100644 --- a/cpp/src/gandiva/filter.h +++ b/cpp/src/gandiva/filter.h @@ -76,7 +76,7 @@ class GANDIVA_EXPORT Filter { Status Evaluate(const arrow::RecordBatch& batch, std::shared_ptr out_selection); - std::string DumpIR(); + const std::string& DumpIR(); void SetBuiltFromCache(bool flag); diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 41cbe0ffe3a3a..62ebab08f4d6b 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -42,15 +42,15 @@ LLVMGenerator::LLVMGenerator(bool cached, function_registry_(std::move(function_registry)), enable_ir_traces_(false) {} -Status LLVMGenerator::Make(const std::shared_ptr& config, bool cached, - std::unique_ptr* llvm_generator) { - std::unique_ptr llvmgen_obj( +Result> LLVMGenerator::Make( + const std::shared_ptr& config, bool cached, + std::optional> object_cache) { + std::unique_ptr llvm_generator( new LLVMGenerator(cached, config->function_registry())); - ARROW_RETURN_NOT_OK(Engine::Make(config, cached, &(llvmgen_obj->engine_))); - *llvm_generator = std::move(llvmgen_obj); - - return Status::OK(); + ARROW_ASSIGN_OR_RAISE(llvm_generator->engine_, + Engine::Make(config, cached, object_cache)); + return llvm_generator; } std::shared_ptr>> @@ -62,8 +62,8 @@ LLVMGenerator::GetCache() { return shared_cache; } -void LLVMGenerator::SetLLVMObjectCache(GandivaObjectCache& object_cache) { - engine_->SetLLVMObjectCache(object_cache); +Status LLVMGenerator::SetLLVMObjectCache(GandivaObjectCache& object_cache) { + return engine_->SetLLVMObjectCache(object_cache); } Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr output) { @@ -73,7 +73,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out ValueValidityPairPtr value_validity; ARROW_RETURN_NOT_OK(decomposer.Decompose(*expr->root(), &value_validity)); // Generate the IR function for the decomposed expression. - std::unique_ptr compiled_expr(new CompiledExpr(value_validity, output)); + auto compiled_expr = std::make_unique(value_validity, output); std::string fn_name = "expr_" + std::to_string(idx) + "_" + std::to_string(static_cast(selection_vector_mode_)); if (!cached_) { @@ -103,7 +103,8 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode // setup the jit functions for each expression. 
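  // Note: with the LLJIT-based Engine, CompiledFunction() now returns
  // arrow::Result<void*> resolved through LLJIT::lookup() rather than a raw
  // address from getFunctionAddress(), so the lookup result is unwrapped with
  // ARROW_ASSIGN_OR_RAISE before being cast to the JIT function pointer type.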
for (auto& compiled_expr : compiled_exprs_) { auto fn_name = compiled_expr->GetFunctionName(mode); - auto jit_fn = reinterpret_cast(engine_->CompiledFunction(fn_name)); + ARROW_ASSIGN_OR_RAISE(auto fn_ptr, engine_->CompiledFunction(fn_name)); + auto jit_fn = reinterpret_cast(fn_ptr); compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn); } diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 250ab78fbfe28..0c532998e8b83 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -18,7 +18,9 @@ #pragma once #include +#include #include +#include #include #include @@ -47,15 +49,17 @@ class FunctionHolder; class GANDIVA_EXPORT LLVMGenerator { public: /// \brief Factory method to initialize the generator. - static Status Make(const std::shared_ptr& config, bool cached, - std::unique_ptr* llvm_generator); + static Result> Make( + const std::shared_ptr& config, bool cached, + std::optional> object_cache = + std::nullopt); /// \brief Get the cache to be used for LLVM ObjectCache. static std::shared_ptr>> GetCache(); /// \brief Set LLVM ObjectCache. - void SetLLVMObjectCache(GandivaObjectCache& object_cache); + Status SetLLVMObjectCache(GandivaObjectCache& object_cache); /// \brief Build the code for the expression trees for default mode with a LLVM /// ObjectCache. Each element in the vector represents an expression tree @@ -79,7 +83,7 @@ class GANDIVA_EXPORT LLVMGenerator { SelectionVector::Mode selection_vector_mode() { return selection_vector_mode_; } LLVMTypes* types() { return engine_->types(); } llvm::Module* module() { return engine_->module(); } - std::string DumpIR() { return engine_->DumpIR(); } + const std::string& ir() { return engine_->ir(); } private: explicit LLVMGenerator(bool cached, diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 853d8ae6c3b8d..79654e7b78c7e 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -47,8 +47,7 @@ class TestLLVMGenerator : public ::testing::Test { auto external_registry = std::make_shared(); auto config = config_factory(std::move(external_registry)); - std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(config, false, &generator)); + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(config, false)); auto module = generator->module(); ASSERT_OK(generator->engine_->LoadFunctionIRs()); @@ -58,8 +57,7 @@ class TestLLVMGenerator : public ::testing::Test { // Verify that a valid pc function exists for every function in the registry. 
TEST_F(TestLLVMGenerator, VerifyPCFunctions) { - std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), false, &generator)); + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); llvm::Module* module = generator->module(); ASSERT_OK(generator->engine_->LoadFunctionIRs()); @@ -70,8 +68,8 @@ TEST_F(TestLLVMGenerator, VerifyPCFunctions) { TEST_F(TestLLVMGenerator, TestAdd) { // Setup LLVM generator to do an arithmetic add of two vectors - std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), false, &generator)); + ASSERT_OK_AND_ASSIGN(auto generator, + LLVMGenerator::Make(TestConfigWithIrDumping(), false)); Annotator annotator; auto field0 = std::make_shared("f0", arrow::int32()); @@ -100,18 +98,22 @@ TEST_F(TestLLVMGenerator, TestAdd) { auto field_sum = std::make_shared("out", arrow::int32()); auto desc_sum = annotator.CheckAndAddInputFieldDescriptor(field_sum); - std::string fn_name = "codegen"; + // LLVM 10 doesn't like the expr function name to be the same as the module name when + // LLJIT is used + std::string fn_name = "llvm_gen_test_add_expr"; ASSERT_OK(generator->engine_->LoadFunctionIRs()); ASSERT_OK(generator->CodeGenExprValue(func_dex, 4, desc_sum, 0, fn_name, SelectionVector::MODE_NONE)); ASSERT_OK(generator->engine_->FinalizeModule()); - auto ir = generator->engine_->DumpIR(); + auto const& ir = generator->engine_->ir(); EXPECT_THAT(ir, testing::HasSubstr("vector.body")); - EvalFunc eval_func = (EvalFunc)generator->engine_->CompiledFunction(fn_name); + ASSERT_OK_AND_ASSIGN(auto fn_ptr, generator->engine_->CompiledFunction(fn_name)); + ASSERT_TRUE(fn_ptr); + auto eval_func = reinterpret_cast(fn_ptr); constexpr size_t kNumRecords = 4; std::array a0{1, 2, 3, 4}; std::array a1{5, 6, 7, 8}; @@ -126,6 +128,7 @@ TEST_F(TestLLVMGenerator, TestAdd) { reinterpret_cast(out.data()), reinterpret_cast(&out_bitmap), }; std::array addr_offsets{0, 0, 0, 0, 0, 0}; + eval_func(addrs.data(), addr_offsets.data(), nullptr, nullptr, nullptr, 0 /* dummy context ptr */, kNumRecords); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index e717e825dfc71..ec0302146fff5 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -80,8 +80,8 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, GandivaObjectCache obj_cache(cache, cache_key); // Build LLVM generator, and generate code for the specified expressions - std::unique_ptr llvm_gen; - ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); + ARROW_ASSIGN_OR_RAISE(auto llvm_gen, + LLVMGenerator::Make(configuration, is_cached, obj_cache)); // Run the validation on the expressions. 
// Return if any of the expression is invalid since @@ -95,7 +95,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, } // Set the object cache for LLVM - llvm_gen->SetLLVMObjectCache(obj_cache); + ARROW_RETURN_NOT_OK(llvm_gen->SetLLVMObjectCache(obj_cache)); ARROW_RETURN_NOT_OK(llvm_gen->Build(exprs, selection_vector_mode)); @@ -281,7 +281,7 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, return Status::OK(); } -std::string Projector::DumpIR() { return llvm_generator_->DumpIR(); } +const std::string& Projector::DumpIR() { return llvm_generator_->ir(); } void Projector::SetBuiltFromCache(bool flag) { built_from_cache_ = flag; } diff --git a/cpp/src/gandiva/projector.h b/cpp/src/gandiva/projector.h index 6801a7c9f3f3c..f1ae7e4dc8ccd 100644 --- a/cpp/src/gandiva/projector.h +++ b/cpp/src/gandiva/projector.h @@ -118,7 +118,7 @@ class GANDIVA_EXPORT Projector { const SelectionVector* selection_vector, const ArrayDataVector& output) const; - std::string DumpIR(); + const std::string& DumpIR(); void SetBuiltFromCache(bool flag); diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index f126b769b2010..450e691323cae 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -16,6 +16,7 @@ // under the License. #include + #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" @@ -420,6 +421,35 @@ static void DoDecimalAdd2(benchmark::State& state, int32_t precision, int32_t sc ASSERT_OK(status); } +static void TimedTestExprCompilation(benchmark::State& state) { + int64_t iteration = 0; + for (auto _ : state) { + // schema for input fields + auto field0 = field("f0", int64()); + auto field1 = field("f1", int64()); + auto literal = TreeExprBuilder::MakeLiteral(iteration); + auto schema = arrow::schema({field0, field1}); + + // output field + auto field_add = field("c1", int64()); + auto field_less_than = field("c2", boolean()); + + // Build expression + auto add_func = TreeExprBuilder::MakeFunction( + "add", {TreeExprBuilder::MakeField(field0), literal}, int64()); + auto less_than_func = TreeExprBuilder::MakeFunction( + "less_than", {TreeExprBuilder::MakeField(field1), literal}, boolean()); + + auto expr_0 = TreeExprBuilder::MakeExpression(add_func, field_add); + auto expr_1 = TreeExprBuilder::MakeExpression(less_than_func, field_less_than); + + std::shared_ptr projector; + ASSERT_OK(Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector)); + + ++iteration; + } +} + static void DecimalAdd2Fast(benchmark::State& state) { // use lesser precision to test the fast-path DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision - 6, 18); @@ -460,6 +490,7 @@ static void DecimalAdd3Large(benchmark::State& state) { DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18, true); } +BENCHMARK(TimedTestExprCompilation)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestAdd3)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestBigNested)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestExtractYear)->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/gandiva/tests/test_util.cc b/cpp/src/gandiva/tests/test_util.cc index 959ea3cd7a446..2ee49ffae0ed6 100644 --- a/cpp/src/gandiva/tests/test_util.cc +++ b/cpp/src/gandiva/tests/test_util.cc @@ -30,6 +30,10 @@ std::shared_ptr TestConfiguration() { return ConfigurationBuilder::DefaultConfiguration(); } +std::shared_ptr TestConfigWithIrDumping() { + return 
ConfigurationBuilder().build_with_ir_dumping(true); +} + #ifndef GANDIVA_EXTENSION_TEST_DIR #define GANDIVA_EXTENSION_TEST_DIR "." #endif diff --git a/cpp/src/gandiva/tests/test_util.h b/cpp/src/gandiva/tests/test_util.h index 69d63732aeeaa..d8181fe67516c 100644 --- a/cpp/src/gandiva/tests/test_util.h +++ b/cpp/src/gandiva/tests/test_util.h @@ -98,6 +98,8 @@ static inline ArrayPtr MakeArrowTypeArray(const std::shared_ptr std::shared_ptr TestConfiguration(); +std::shared_ptr TestConfigWithIrDumping(); + // helper function to create a Configuration with an external function registered to the // given function registry std::shared_ptr TestConfigWithFunctionRegistry( diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx index 35bbf5018f08a..2202ec64f2962 100644 --- a/python/pyarrow/gandiva.pyx +++ b/python/pyarrow/gandiva.pyx @@ -36,6 +36,7 @@ from pyarrow.includes.libgandiva cimport ( CNode, CProjector, CFilter, CSelectionVector, _ensure_selection_mode, + CConfiguration, CConfigurationBuilder, TreeExprBuilder_MakeExpression, TreeExprBuilder_MakeFunction, @@ -583,9 +584,47 @@ cdef class TreeExprBuilder(_Weakrefable): condition.node) return Condition.create(r) +cdef class Configuration(_Weakrefable): + cdef: + shared_ptr[CConfiguration] configuration + + def __cinit__(self, bint optimize=True, bint dump_ir=False): + """ + Initialize the configuration with specified options. + + Parameters + ---------- + optimize : bool, default True + Whether to enable optimizations. + dump_ir : bool, default False + Whether to dump LLVM IR. + """ + self.configuration = CConfigurationBuilder().build() + self.configuration.get().set_optimize(optimize) + self.configuration.get().set_dump_ir(dump_ir) + + @staticmethod + cdef create(shared_ptr[CConfiguration] configuration): + """ + Create a Configuration instance from an existing CConfiguration pointer. + + Parameters + ---------- + configuration : shared_ptr[CConfiguration] + Existing CConfiguration pointer. + + Returns + ------- + Configuration instance + """ + cdef Configuration self = Configuration.__new__(Configuration) + self.configuration = configuration + return self + cpdef make_projector(Schema schema, children, MemoryPool pool, - str selection_mode="NONE"): + str selection_mode="NONE", + Configuration configuration=None): """ Construct a projection using expressions. @@ -602,6 +641,8 @@ cpdef make_projector(Schema schema, children, MemoryPool pool, Memory pool used to allocate output arrays. selection_mode : str, default "NONE" Possible values are NONE, UINT16, UINT32, UINT64. + configuration : pyarrow.gandiva.Configuration, default None + Configuration for the projector. Returns ------- @@ -612,6 +653,9 @@ cpdef make_projector(Schema schema, children, MemoryPool pool, c_vector[shared_ptr[CGandivaExpression]] c_children shared_ptr[CProjector] result + if configuration is None: + configuration = Configuration() + for child in children: if child is None: raise TypeError("Expressions must not be None") @@ -620,12 +664,13 @@ cpdef make_projector(Schema schema, children, MemoryPool pool, check_status( Projector_Make(schema.sp_schema, c_children, _ensure_selection_mode(selection_mode), - CConfigurationBuilder.DefaultConfiguration(), + configuration.configuration, &result)) return Projector.create(result, pool) -cpdef make_filter(Schema schema, Condition condition): +cpdef make_filter(Schema schema, Condition condition, + Configuration configuration=None): """ Construct a filter based on a condition. 
@@ -638,6 +683,8 @@ cpdef make_filter(Schema schema, Condition condition): Schema for the record batches, and the condition. condition : pyarrow.gandiva.Condition Filter condition. + configuration : pyarrow.gandiva.Configuration, default None + Configuration for the filter. Returns ------- @@ -646,8 +693,12 @@ cpdef make_filter(Schema schema, Condition condition): cdef shared_ptr[CFilter] result if condition is None: raise TypeError("Condition must not be None") + + if configuration is None: + configuration = Configuration() + check_status( - Filter_Make(schema.sp_schema, condition.condition, &result)) + Filter_Make(schema.sp_schema, condition.condition, configuration.configuration, &result)) return Filter.create(result) diff --git a/python/pyarrow/includes/libgandiva.pxd b/python/pyarrow/includes/libgandiva.pxd index fa3b72bad61be..7d76576bef2b9 100644 --- a/python/pyarrow/includes/libgandiva.pxd +++ b/python/pyarrow/includes/libgandiva.pxd @@ -252,6 +252,7 @@ cdef extern from "gandiva/filter.h" namespace "gandiva" nogil: cdef CStatus Filter_Make \ "gandiva::Filter::Make"( shared_ptr[CSchema] schema, shared_ptr[CCondition] condition, + shared_ptr[CConfiguration] configuration, shared_ptr[CFilter]* filter) cdef extern from "gandiva/function_signature.h" namespace "gandiva" nogil: @@ -278,9 +279,20 @@ cdef extern from "gandiva/expression_registry.h" namespace "gandiva" nogil: cdef extern from "gandiva/configuration.h" namespace "gandiva" nogil: cdef cppclass CConfiguration" gandiva::Configuration": - pass + + CConfiguration() + + CConfiguration(bint optimize, bint dump_ir) + + void set_optimize(bint optimize) + + void set_dump_ir(bint dump_ir) cdef cppclass CConfigurationBuilder \ " gandiva::ConfigurationBuilder": @staticmethod shared_ptr[CConfiguration] DefaultConfiguration() + + CConfigurationBuilder() + + shared_ptr[CConfiguration] build() diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py index 241cac4d83db4..80d119a48530d 100644 --- a/python/pyarrow/tests/test_gandiva.py +++ b/python/pyarrow/tests/test_gandiva.py @@ -47,8 +47,9 @@ def test_tree_exp_builder(): assert expr.result().type == pa.int32() + config = gandiva.Configuration(dump_ir=True) projector = gandiva.make_projector( - schema, [expr], pa.default_memory_pool()) + schema, [expr], pa.default_memory_pool(), "NONE", config) # Gandiva generates compute kernel function named `@expr_X` assert projector.llvm_ir.find("@expr_") != -1 @@ -104,7 +105,8 @@ def test_filter(): assert condition.result().type == pa.bool_() - filter = gandiva.make_filter(table.schema, condition) + config = gandiva.Configuration(dump_ir=True) + filter = gandiva.make_filter(table.schema, condition, config) # Gandiva generates compute kernel function named `@expr_X` assert filter.llvm_ir.find("@expr_") != -1 From 6c3972651e2dfa874f9bc38791de329bcdd78ecd Mon Sep 17 00:00:00 2001 From: Tammy DiPrima Date: Thu, 4 Jan 2024 16:18:22 -0500 Subject: [PATCH 130/570] GH-39114: [JS] Fix Example Code (#39442) --- js/examples/read_file.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/examples/read_file.html b/js/examples/read_file.html index 1013fbe79ef65..cd4d58f542756 100644 --- a/js/examples/read_file.html +++ b/js/examples/read_file.html @@ -41,7 +41,7 @@ } reader.onload = function (evt) { - var arrowTable = Arrow.Table.from([new Uint8Array(evt.target.result)]); + var arrowTable = Arrow.tableFromIPC(evt.target.result); var thead = document.getElementById("thead"); var tbody = 
document.getElementById("tbody"); From 7b0c6f955675c9ad309afc5f82da1623f9b13a59 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 21:04:25 -0300 Subject: [PATCH 131/570] GH-39384: [C++] Disable -Werror=attributes for Azure SDK's identity.hpp (#39448) ### Rationale for this change Warnings in included headers are causing -Werror builds to fail. ### What changes are included in this PR? Push and pop of ignore warning pragmas. ### Are these changes tested? I'm asking @ anjakefala to test the build on this branch. * Closes: #39384 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 21350a490411a..029e19bc0e32a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -18,7 +18,16 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" +// idenfity.hpp triggers -Wattributes warnings cause -Werror builds to fail, +// so disable it for this file with pragmas. +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif #include +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif #include #include From bec03856799a69bf0e6d4419ab7bc565afd070fe Mon Sep 17 00:00:00 2001 From: Jinpeng Date: Thu, 4 Jan 2024 21:41:01 -0500 Subject: [PATCH 132/570] PARQUET-2411: [C++][Parquet] Allow reading dictionary without reading data via ByteArrayDictionaryRecordReader (#39153) ### Rationale for this change This proposes an API to read only the dictionary from ByteArrayDictionaryRecordReader, enabling possible uses cases where the caller just want to check the dictionary content. ### What changes are included in this PR? New APIs to enable reading dictionary with RecordReader. ### Are these changes tested? Unit tests. ### Are there any user-facing changes? New APIs without breaking existing workflow. Authored-by: jp0317 Signed-off-by: mwish --- cpp/src/parquet/column_reader.cc | 20 +++++ cpp/src/parquet/column_reader.h | 10 +++ cpp/src/parquet/file_reader.cc | 79 +++++++++++-------- cpp/src/parquet/file_reader.h | 15 +++- cpp/src/parquet/reader_test.cc | 127 +++++++++++++++++++++++++++++++ 5 files changed, 217 insertions(+), 34 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index a49e58afbdb83..99978e283b46a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1370,6 +1370,26 @@ class TypedRecordReader : public TypedColumnReaderImpl, return bytes_for_values; } + const void* ReadDictionary(int32_t* dictionary_length) override { + if (this->current_decoder_ == nullptr && !this->HasNextInternal()) { + dictionary_length = 0; + return nullptr; + } + // Verify the current data page is dictionary encoded. The current_encoding_ should + // have been set as RLE_DICTIONARY if the page encoding is RLE_DICTIONARY or + // PLAIN_DICTIONARY. + if (this->current_encoding_ != Encoding::RLE_DICTIONARY) { + std::stringstream ss; + ss << "Data page is not dictionary encoded. 
Encoding: " + << EncodingToString(this->current_encoding_); + throw ParquetException(ss.str()); + } + auto decoder = dynamic_cast*>(this->current_decoder_); + const T* dictionary = nullptr; + decoder->GetDictionary(&dictionary, dictionary_length); + return reinterpret_cast(dictionary); + } + int64_t ReadRecords(int64_t num_records) override { if (num_records == 0) return 0; // Delimit records, then read values at the end diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 334b8bcffe0b8..086f6c0e55806 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -368,6 +368,16 @@ class PARQUET_EXPORT RecordReader { virtual void DebugPrintState() = 0; + /// \brief Returns the dictionary owned by the current decoder. Throws an + /// exception if the current decoder is not for dictionary encoding. The caller is + /// responsible for casting the returned pointer to proper type depending on the + /// column's physical type. An example: + /// const ByteArray* dict = reinterpret_cast(ReadDictionary(&len)); + /// or: + /// const float* dict = reinterpret_cast(ReadDictionary(&len)); + /// \param[out] dictionary_length The number of dictionary entries. + virtual const void* ReadDictionary(int32_t* dictionary_length) = 0; + /// \brief Decoded definition levels int16_t* def_levels() const { return reinterpret_cast(def_levels_->mutable_data()); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1d972b78fb99c..b3dd1d6054ac8 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -54,6 +54,36 @@ using arrow::internal::AddWithOverflow; namespace parquet { +namespace { +bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { + // Check the encoding_stats to see if all data pages are dictionary encoded. + const std::vector& encoding_stats = col.encoding_stats(); + if (encoding_stats.empty()) { + // Some parquet files may have empty encoding_stats. In this case we are + // not sure whether all data pages are dictionary encoded. + return false; + } + // The 1st page should be the dictionary page. + if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE || + (encoding_stats[0].encoding != Encoding::PLAIN && + encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) { + return false; + } + // The following pages should be dictionary encoded data pages. + for (size_t idx = 1; idx < encoding_stats.size(); ++idx) { + if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY && + encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) || + (encoding_stats[idx].page_type != PageType::DATA_PAGE && + encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) { + // Return false if any following page is not a dictionary encoded data + // page. 
+ return false; + } + } + return true; +} +} // namespace + // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; @@ -82,7 +112,8 @@ std::shared_ptr RowGroupReader::Column(int i) { const_cast(contents_->properties())->memory_pool()); } -std::shared_ptr RowGroupReader::RecordReader(int i) { +std::shared_ptr RowGroupReader::RecordReader( + int i, bool read_dictionary) { if (i >= metadata()->num_columns()) { std::stringstream ss; ss << "Trying to read column index " << i << " but row group metadata has only " @@ -96,8 +127,8 @@ std::shared_ptr RowGroupReader::RecordReader(int i) { internal::LevelInfo level_info = internal::LevelInfo::ComputeLevelInfo(descr); auto reader = internal::RecordReader::Make( - descr, level_info, contents_->properties()->memory_pool(), - /* read_dictionary = */ false, contents_->properties()->read_dense_for_nullable()); + descr, level_info, contents_->properties()->memory_pool(), read_dictionary, + contents_->properties()->read_dense_for_nullable()); reader->SetPageReader(std::move(page_reader)); return reader; } @@ -106,41 +137,23 @@ std::shared_ptr RowGroupReader::ColumnWithExposeEncoding( int i, ExposedEncoding encoding_to_expose) { std::shared_ptr reader = Column(i); - if (encoding_to_expose == ExposedEncoding::DICTIONARY) { - // Check the encoding_stats to see if all data pages are dictionary encoded. - std::unique_ptr col = metadata()->ColumnChunk(i); - const std::vector& encoding_stats = col->encoding_stats(); - if (encoding_stats.empty()) { - // Some parquet files may have empty encoding_stats. In this case we are - // not sure whether all data pages are dictionary encoded. So we do not - // enable exposing dictionary. - return reader; - } - // The 1st page should be the dictionary page. - if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE || - (encoding_stats[0].encoding != Encoding::PLAIN && - encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) { - return reader; - } - // The following pages should be dictionary encoded data pages. - for (size_t idx = 1; idx < encoding_stats.size(); ++idx) { - if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY && - encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) || - (encoding_stats[idx].page_type != PageType::DATA_PAGE && - encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) { - return reader; - } - } - } else { - // Exposing other encodings are not supported for now. - return reader; + if (encoding_to_expose == ExposedEncoding::DICTIONARY && + IsColumnChunkFullyDictionaryEncoded(*metadata()->ColumnChunk(i))) { + // Set exposed encoding. + reader->SetExposedEncoding(encoding_to_expose); } - // Set exposed encoding. 
- reader->SetExposedEncoding(encoding_to_expose); return reader; } +std::shared_ptr RowGroupReader::RecordReaderWithExposeEncoding( + int i, ExposedEncoding encoding_to_expose) { + return RecordReader( + i, + /*read_dictionary=*/encoding_to_expose == ExposedEncoding::DICTIONARY && + IsColumnChunkFullyDictionaryEncoded(*metadata()->ColumnChunk(i))); +} + std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { if (i >= metadata()->num_columns()) { std::stringstream ss; diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index da85b73fc2dfe..b59b59f95c2d8 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -64,7 +64,8 @@ class PARQUET_EXPORT RowGroupReader { // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group. // Ownership is shared with the RowGroupReader. - std::shared_ptr RecordReader(int i); + std::shared_ptr RecordReader(int i, + bool read_dictionary = false); // Construct a ColumnReader, trying to enable exposed encoding. // @@ -80,6 +81,18 @@ class PARQUET_EXPORT RowGroupReader { std::shared_ptr ColumnWithExposeEncoding( int i, ExposedEncoding encoding_to_expose); + // Construct a RecordReader, trying to enable exposed encoding. + // + // For dictionary encoding, currently we only support column chunks that are + // fully dictionary encoded byte arrays. The caller should verify if the reader can read + // and expose the dictionary by checking the reader's read_dictionary(). If a column + // chunk uses dictionary encoding but then falls back to plain encoding, the returned + // reader will read decoded data without exposing the dictionary. + // + // \note API EXPERIMENTAL + std::shared_ptr RecordReaderWithExposeEncoding( + int i, ExposedEncoding encoding_to_expose); + std::unique_ptr GetColumnPageReader(int i); private: diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 5223158e5f4f9..2c2b62f5d12f6 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -542,6 +542,83 @@ TEST(TestFileReader, GetRecordReader) { ASSERT_EQ(8, col_record_reader_->levels_written()); } +TEST(TestFileReader, RecordReaderWithExposingDictionary) { + const int num_rows = 1000; + + // Make schema + schema::NodeVector fields; + fields.push_back(PrimitiveNode::Make("field", Repetition::REQUIRED, Type::BYTE_ARRAY, + ConvertedType::NONE)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + // Write small batches and small data pages + std::shared_ptr writer_props = WriterProperties::Builder() + .write_batch_size(64) + ->data_pagesize(128) + ->enable_dictionary() + ->build(); + + ASSERT_OK_AND_ASSIGN(auto out_file, ::arrow::io::BufferOutputStream::Create()); + std::shared_ptr file_writer = + ParquetFileWriter::Open(out_file, schema, writer_props); + + RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // write one column + ::arrow::random::RandomArrayGenerator rag(0); + ByteArrayWriter* writer = static_cast(rg_writer->NextColumn()); + std::vector raw_unique_data = {"a", "bc", "defg"}; + std::vector col_typed; + for (int i = 0; i < num_rows; i++) { + std::string_view chosed_data = raw_unique_data[i % raw_unique_data.size()]; + col_typed.emplace_back(chosed_data); + } + writer->WriteBatch(num_rows, nullptr, nullptr, col_typed.data()); + rg_writer->Close(); + file_writer->Close(); + + // Open the reader + ASSERT_OK_AND_ASSIGN(auto file_buf, out_file->Finish()); + auto in_file = 
std::make_shared<::arrow::io::BufferReader>(file_buf); + + ReaderProperties reader_props; + reader_props.enable_buffered_stream(); + reader_props.set_buffer_size(64); + std::unique_ptr file_reader = + ParquetFileReader::Open(in_file, reader_props); + + auto row_group = file_reader->RowGroup(0); + auto record_reader = std::dynamic_pointer_cast( + row_group->RecordReaderWithExposeEncoding(0, ExposedEncoding::DICTIONARY)); + ASSERT_NE(record_reader, nullptr); + ASSERT_TRUE(record_reader->read_dictionary()); + + int32_t dict_len = 0; + auto dict = + reinterpret_cast(record_reader->ReadDictionary(&dict_len)); + ASSERT_NE(dict, nullptr); + ASSERT_EQ(dict_len, raw_unique_data.size()); + ASSERT_EQ(record_reader->ReadRecords(num_rows), num_rows); + std::shared_ptr<::arrow::ChunkedArray> result_array = record_reader->GetResult(); + ASSERT_EQ(result_array->num_chunks(), 1); + const std::shared_ptr<::arrow::Array> chunk = result_array->chunk(0); + auto dictionary_array = std::dynamic_pointer_cast<::arrow::DictionaryArray>(chunk); + const int32_t* indices = + (std::dynamic_pointer_cast<::arrow::Int32Array>(dictionary_array->indices())) + ->raw_values(); + + // Verify values based on the dictionary from ReadDictionary(). + int64_t indices_read = chunk->length(); + ASSERT_EQ(indices_read, num_rows); + for (int i = 0; i < indices_read; ++i) { + ASSERT_LT(indices[i], dict_len); + ASSERT_EQ(std::string_view(reinterpret_cast(dict[indices[i]].ptr), + dict[indices[i]].len), + col_typed[i]); + } +} + class TestLocalFile : public ::testing::Test { public: void SetUp() { @@ -1064,6 +1141,56 @@ TEST(TestFileReader, BufferedReadsWithDictionary) { } } +TEST(TestFileReader, PartiallyDictionaryEncodingNotExposed) { + const int num_rows = 1000; + + // Make schema + schema::NodeVector fields; + fields.push_back(PrimitiveNode::Make("field", Repetition::REQUIRED, Type::DOUBLE, + ConvertedType::NONE)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + // Write small batches and small data pages. Explicitly set the dictionary page size + // limit such that the column chunk will not be fully dictionary encoded. 
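  // With the tiny dictionary page size limit above, the writer falls back to
  // plain-encoded data pages partway through the column chunk, so the chunk is
  // not fully dictionary encoded and the reader below is expected to keep its
  // default (non-DICTIONARY) exposed encoding.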
+ std::shared_ptr writer_props = WriterProperties::Builder() + .write_batch_size(64) + ->data_pagesize(128) + ->enable_dictionary() + ->dictionary_pagesize_limit(4) + ->build(); + + ASSERT_OK_AND_ASSIGN(auto out_file, ::arrow::io::BufferOutputStream::Create()); + std::shared_ptr file_writer = + ParquetFileWriter::Open(out_file, schema, writer_props); + + RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // write one column + ::arrow::random::RandomArrayGenerator rag(0); + DoubleWriter* writer = static_cast(rg_writer->NextColumn()); + std::shared_ptr<::arrow::Array> col = rag.Float64(num_rows, 0, 100); + const auto& col_typed = static_cast(*col); + writer->WriteBatch(num_rows, nullptr, nullptr, col_typed.raw_values()); + rg_writer->Close(); + file_writer->Close(); + + // Open the reader + ASSERT_OK_AND_ASSIGN(auto file_buf, out_file->Finish()); + auto in_file = std::make_shared<::arrow::io::BufferReader>(file_buf); + + ReaderProperties reader_props; + reader_props.enable_buffered_stream(); + reader_props.set_buffer_size(64); + std::unique_ptr file_reader = + ParquetFileReader::Open(in_file, reader_props); + + auto row_group = file_reader->RowGroup(0); + auto col_reader = std::static_pointer_cast( + row_group->ColumnWithExposeEncoding(0, ExposedEncoding::DICTIONARY)); + EXPECT_NE(col_reader->GetExposedEncoding(), ExposedEncoding::DICTIONARY); +} + TEST(TestFileReader, BufferedReads) { // PARQUET-1636: Buffered reads were broken before introduction of // RandomAccessFile::GetStream From 04d79846dc5fff606dd66407c5479e087185b35a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 5 Jan 2024 23:04:31 +0900 Subject: [PATCH 133/570] GH-39433: [Ruby] Add support for Table.load(format: json) options (#39464) ### Rationale for this change Other `format:` such as `format: :csv` accepts custom options. `format: :json` should also accept them. ### What changes are included in this PR? Use `Arrow::JSONReadOptions` for `Table::Load(format: :json)`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #39433 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ruby/red-arrow/lib/arrow/table-loader.rb | 8 +++++++- ruby/red-arrow/test/helper.rb | 1 + ruby/red-arrow/test/test-table.rb | 25 ++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ruby/red-arrow/lib/arrow/table-loader.rb b/ruby/red-arrow/lib/arrow/table-loader.rb index 308eb16a37ad0..450be3fbe09ff 100644 --- a/ruby/red-arrow/lib/arrow/table-loader.rb +++ b/ruby/red-arrow/lib/arrow/table-loader.rb @@ -252,7 +252,13 @@ def load_as_feather def load_as_json open_input_stream do |input| - reader = JSONReader.new(input) + options = JSONReadOptions.new + @options.each do |key, value| + next if value.nil? 
+ setter = :"#{key}=" + options.__send__(setter, value) if options.respond_to?(setter) + end + reader = JSONReader.new(input, options) table = reader.read table.refer_input(input) table diff --git a/ruby/red-arrow/test/helper.rb b/ruby/red-arrow/test/helper.rb index 7fa6764dd40c2..42732a5954a6d 100644 --- a/ruby/red-arrow/test/helper.rb +++ b/ruby/red-arrow/test/helper.rb @@ -18,6 +18,7 @@ require "arrow" require "fiddle" +require "json" require "pathname" require "tempfile" require "timeout" diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index 7c372bd44f14a..883cf70c269bb 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -677,6 +677,31 @@ def test_tsv format: :tsv, schema: @table.schema)) end + + def test_json + output = create_output(".json") + # TODO: Implement this. + # @table.save(output, format: :json) + columns = "" + @table.each_record.each do |record| + column = { + "count" => record.count, + "visible" => record.visible, + } + columns << column.to_json + columns << "\n" + end + if output.is_a?(String) + File.write(output, columns) + else + output.resize(columns.bytesize) + output.set_data(0, columns) + end + assert_equal(@table, + Arrow::Table.load(output, + format: :json, + schema: @table.schema)) + end end sub_test_case("path") do From 42b995b4f8de239da2be17430706cf4eb795ac50 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 5 Jan 2024 20:49:07 +0530 Subject: [PATCH 134/570] MINOR: [Java] Bump com.google.errorprone:error_prone_core from 2.4.0 to 2.24.0 in /java (#39452) ### Rationale for this change This is a draft PR for fixing the dependabot PR https://github.com/apache/arrow/pull/39409 ### What changes are included in this PR? Upgrading `com.google.errorprone` to 2.24.0 for JDK11+ and restricting `com.google.errorprone` to 2.10 to JDK8. ### Are these changes tested? N/A. CIs are implicitly testing this including existing test cases. ### Are there any user-facing changes? 
No Lead-authored-by: vibhatha Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 4 ++-- .../apache/arrow/vector/complex/writer/TestComplexWriter.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 522ee4abc7669..fae072018eb19 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -44,7 +44,7 @@ 2 true 9+181-r4173-1 - 2.22.0 + 2.24.0 3.11.0 5.5.0 5.2.0 @@ -844,7 +844,7 @@ com.google.errorprone error_prone_core - 2.4.0 + 2.10.0 diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 96d39e85f1f4a..e03ce0c056bf1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -773,7 +773,7 @@ public void simpleUnion() throws Exception { for (int i = 0; i < COUNT; i++) { unionReader.setPosition(i); if (i % 5 == 0) { - Assert.assertEquals(i, i, unionReader.readInteger()); + Assert.assertEquals(i, unionReader.readInteger().intValue()); } else if (i % 5 == 1) { NullableTimeStampMilliTZHolder holder = new NullableTimeStampMilliTZHolder(); unionReader.read(holder); From aae6fa40b458a90c598df281fdc8fc023e05a262 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 5 Jan 2024 12:44:45 -0300 Subject: [PATCH 135/570] GH-38772: [C++] Implement directory semantics even when the storage account doesn't support HNS (#39361) ### Rationale for this change The `FileSystem` implementation based on Azure Blob Storage should implement directory operations according to filesystem semantics. When Hierarchical Namespace (HNS) is enabled, we can rely on Azure Data Lake Storage Gen 2 APIs implementing the filesystem semantics for us, but when all we have is the Blobs API, we should emulate it. ### What changes are included in this PR? - Skip fewer tests - Re-implement `GetFileInfo` using `ListBlobsByHierarchy` instead of `ListBlobs` - Re-implement `CreateDir` with an upfront HNS support check instead of falling back to Blobs API after an error - Add comprehensive tests to `CreateDir` - Add `HasSubmitBatchBug` to check if a test inside any scenario is affected by a certain Azurite issue - Implement `DeleteDir` to work properly on flat namespace storage accounts (non-HNS accounts) - ### Are these changes tested? Yes. By existing and new tests added by this PR itself. 
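For illustration, a minimal standalone sketch of the flat-namespace probe that the reworked `GetFileInfo` performs via `ListBlobsByHierarchy` (a returned blob prefix equal to `path + "/"` implies a directory; a blob named exactly `path` is a file). The connection string and container name are placeholders and error handling is omitted.

#include <azure/storage/blobs.hpp>
#include <iostream>
#include <string>

namespace Blobs = Azure::Storage::Blobs;

// Classify `path` on a flat-namespace (non-HNS) storage account using a single
// hierarchical listing with PageSizeHint=1, mirroring the logic added below.
std::string ClassifyPath(const Blobs::BlobContainerClient& container, std::string path) {
  while (!path.empty() && path.back() == '/') path.pop_back();  // strip trailing slash
  Blobs::ListBlobsOptions options;
  options.Prefix = path;
  options.PageSizeHint = 1;
  auto page = container.ListBlobsByHierarchy("/", options);
  // A prefix that matches "path/" exactly means at least one blob lives under
  // that prefix, i.e. an implied (or marker) directory.
  if (!page.BlobPrefixes.empty() && page.BlobPrefixes[0] == path + "/") {
    return "directory";
  }
  // A blob named exactly `path` is a regular file.
  if (!page.Blobs.empty() && page.Blobs[0].Name == path) {
    return "file";
  }
  return "not found";
}

int main() {
  // Placeholder credentials; Azurite's development connection string also works.
  const std::string conn = "<storage connection string>";
  auto container =
      Blobs::BlobContainerClient::CreateFromConnectionString(conn, "my-container");
  std::cout << ClassifyPath(container, "some/dir/") << std::endl;
  return 0;
}
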
* Closes: #38772 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 709 +++++++++++++------- cpp/src/arrow/filesystem/azurefs_internal.h | 2 +- cpp/src/arrow/filesystem/azurefs_test.cc | 444 +++++++----- 3 files changed, 731 insertions(+), 424 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 029e19bc0e32a..9569eff2e47ed 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -828,7 +828,7 @@ bool IsDfsEmulator(const AzureOptions& options) { namespace internal { Result CheckIfHierarchicalNamespaceIsEnabled( - DataLake::DataLakeFileSystemClient& adlfs_client, const AzureOptions& options) { + const DataLake::DataLakeFileSystemClient& adlfs_client, const AzureOptions& options) { try { auto directory_client = adlfs_client.GetDirectoryClient(""); // GetAccessControlList will fail on storage accounts @@ -891,10 +891,12 @@ namespace { const char kDelimiter[] = {internal::kSep, '\0'}; +/// \pre location.container is not empty. template -Result GetContainerPropsAsFileInfo(const std::string& container_name, - ContainerClient& container_client) { - FileInfo info{container_name}; +Result GetContainerPropsAsFileInfo(const AzureLocation& location, + const ContainerClient& container_client) { + DCHECK(!location.container.empty()); + FileInfo info{location.path.empty() ? location.all : location.container}; try { auto properties = container_client.GetProperties(); info.set_type(FileType::Directory); @@ -910,6 +912,18 @@ Result GetContainerPropsAsFileInfo(const std::string& container_name, } } +template +Status CreateContainerIfNotExists(const std::string& container_name, + const ContainerClient& container_client) { + try { + container_client.CreateIfNotExists(); + return Status::OK(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to create a container: ", container_name, + ": ", container_client.GetUrl()); + } +} + FileInfo DirectoryFileInfoFromPath(std::string_view path) { return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, FileType::Directory}; } @@ -955,12 +969,21 @@ class AzureFileSystem::Impl { io::IOContext& io_context() { return io_context_; } const AzureOptions& options() const { return options_; } - private: + Blobs::BlobContainerClient GetBlobContainerClient(const std::string& container_name) { + return blob_service_client_->GetBlobContainerClient(container_name); + } + + /// \param container_name Also known as "filesystem" in the ADLS Gen2 API. + DataLake::DataLakeFileSystemClient GetFileSystemClient( + const std::string& container_name) { + return datalake_service_client_->GetFileSystemClient(container_name); + } + /// \brief Memoized version of CheckIfHierarchicalNamespaceIsEnabled. /// /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never returned). Result HierarchicalNamespaceSupport( - DataLake::DataLakeFileSystemClient& adlfs_client) { + const DataLake::DataLakeFileSystemClient& adlfs_client) { switch (cached_hns_support_) { case HNSSupport::kEnabled: case HNSSupport::kDisabled: @@ -987,7 +1010,6 @@ class AzureFileSystem::Impl { return hns_support; } - public: /// This is used from unit tests to ensure we perform operations on all the /// possible states of cached_hns_support_. 
void ForceCachedHierarchicalNamespaceSupport(int support) { @@ -1004,33 +1026,20 @@ class AzureFileSystem::Impl { DCHECK(false) << "Invalid enum HierarchicalNamespaceSupport value."; } - Result GetFileInfo(const AzureLocation& location) { - if (location.container.empty()) { - DCHECK(location.path.empty()); - // Root directory of the storage account. - return FileInfo{"", FileType::Directory}; - } - if (location.path.empty()) { - // We have a container, but no path within the container. - // The container itself represents a directory. - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - return GetContainerPropsAsFileInfo(location.container, container_client); - } - // There is a path to search within the container. - FileInfo info{location.all}; - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + /// \pre location.path is not empty. + Result GetFileInfo(const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location) { auto file_client = adlfs_client.GetFileClient(location.path); try { + FileInfo info{location.all}; auto properties = file_client.GetProperties(); if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); } else if (internal::HasTrailingSlash(location.path)) { - // For a path with a trailing slash a hierarchical namespace may return a blob - // with that trailing slash removed. For consistency with flat namespace and - // other filesystems we chose to return NotFound. - // - // NOTE(felipecrv): could this be an empty directory marker? + // For a path with a trailing slash, a Hierarchical Namespace storage account + // may recognize a file (path with trailing slash removed). For consistency + // with other arrow::FileSystem implementations we chose to return NotFound + // because the trailing slash means the user was looking for a directory. info.set_type(FileType::NotFound); return info; } else { @@ -1042,47 +1051,88 @@ class AzureFileSystem::Impl { return info; } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - ARROW_ASSIGN_OR_RAISE(auto hns_support, - HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound || - hns_support == HNSSupport::kEnabled) { - // If the hierarchical namespace is enabled, then the storage account will - // have explicit directories. Neither a file nor a directory was found. - info.set_type(FileType::NotFound); + return FileInfo{location.all, FileType::NotFound}; + } + return ExceptionToStatus( + exception, "GetProperties for '", file_client.GetUrl(), + "' failed. GetFileInfo is unable to determine whether the path exists."); + } + } + + /// On flat namespace accounts there are no real directories. Directories are + /// implied by empty directory marker blobs with names ending in "/" or there + /// being blobs with names starting with the directory path. + /// + /// \pre location.path is not empty. + Result GetFileInfo(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.path.empty()); + Blobs::ListBlobsOptions options; + options.Prefix = internal::RemoveTrailingSlash(location.path); + options.PageSizeHint = 1; + + try { + FileInfo info{location.all}; + auto list_response = container_client.ListBlobsByHierarchy(kDelimiter, options); + // Since PageSizeHint=1, we expect at most one entry in either Blobs or + // BlobPrefixes. 
A BlobPrefix always ends with kDelimiter ("/"), so we can + // distinguish between a directory and a file by checking if we received a + // prefix or a blob. + if (!list_response.BlobPrefixes.empty()) { + // Ensure the returned BlobPrefixes[0] string doesn't contain more characters than + // the requested Prefix. For instance, if we request with Prefix="dir/abra" and + // the container contains "dir/abracadabra/" but not "dir/abra/", we will get back + // "dir/abracadabra/" in the BlobPrefixes list. If "dir/abra/" existed, + // it would be returned instead because it comes before "dir/abracadabra/" in the + // lexicographic order guaranteed by ListBlobsByHierarchy. + const auto& blob_prefix = list_response.BlobPrefixes[0]; + if (blob_prefix == internal::EnsureTrailingSlash(location.path)) { + info.set_type(FileType::Directory); return info; } - // On flat namespace accounts there are no real directories. Directories are only - // implied by using `/` in the blob name. - Blobs::ListBlobsOptions list_blob_options; - // If listing the prefix `path.path_to_file` with trailing slash returns at least - // one result then `path` refers to an implied directory. - list_blob_options.Prefix = internal::EnsureTrailingSlash(location.path); - // We only need to know if there is at least one result, so minimise page size - // for efficiency. - list_blob_options.PageSizeHint = 1; - - try { - auto paged_list_result = - blob_service_client_->GetBlobContainerClient(location.container) - .ListBlobs(list_blob_options); - auto file_type = paged_list_result.Blobs.size() > 0 ? FileType::Directory - : FileType::NotFound; - info.set_type(file_type); + } + if (!list_response.Blobs.empty()) { + const auto& blob = list_response.Blobs[0]; + if (blob.Name == location.path) { + info.set_type(FileType::File); + info.set_size(blob.BlobSize); + info.set_mtime( + std::chrono::system_clock::time_point{blob.Details.LastModified}); return info; - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "ListBlobs failed for prefix='", *list_blob_options.Prefix, - "' failed. GetFileInfo is unable to determine whether the path should " - "be considered an implied directory."); } } + info.set_type(FileType::NotFound); + return info; + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + return FileInfo{location.all, FileType::NotFound}; + } return ExceptionToStatus( - exception, "GetProperties failed for '", file_client.GetUrl(), - "' GetFileInfo is unable to determine whether the path exists."); + exception, "ListBlobsByHierarchy failed for prefix='", *options.Prefix, + "'. GetFileInfo is unable to determine whether the path exists."); } } private: + /// \pref location.container is not empty. 
+ template + Status CheckDirExists(const ContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + FileInfo info; + if (location.path.empty()) { + ARROW_ASSIGN_OR_RAISE(info, + GetContainerPropsAsFileInfo(location, container_client)); + } else { + ARROW_ASSIGN_OR_RAISE(info, GetFileInfo(container_client, location)); + } + if (info.type() == FileType::NotFound) { + return PathNotFound(location); + } + DCHECK_EQ(info.type(), FileType::Directory); + return Status::OK(); + } + template Status VisitContainers(const Core::Context& context, OnContainer&& on_container) const { Blobs::ListBlobContainersOptions options; @@ -1297,97 +1347,79 @@ class AzureFileSystem::Impl { return ptr; } - Status CreateDir(const AzureLocation& location) { - if (location.container.empty()) { - return Status::Invalid("CreateDir requires a non-empty path."); - } - - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - if (location.path.empty()) { - try { - auto response = container_client.Create(); - return response.Value.Created - ? Status::OK() - : Status::AlreadyExists("Directory already exists: " + location.all); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to create a container: ", location.container, - ": ", container_client.GetUrl()); - } - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return PathNotFound(location); - } - if (hns_support == HNSSupport::kDisabled) { - ARROW_ASSIGN_OR_RAISE( - auto container_info, - GetContainerPropsAsFileInfo(location.container, container_client)); - if (container_info.type() == FileType::NotFound) { - return PathNotFound(location); - } - // Without hierarchical namespace enabled Azure blob storage has no directories. - // Therefore we can't, and don't need to create one. Simply creating a blob with `/` - // in the name implies directories. - return Status::OK(); - } - - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - auto response = directory_client.Create(); - if (response.Value.Created) { - return Status::OK(); - } else { - return StatusFromErrorResponse(directory_client.GetUrl(), *response.RawResponse, - "Failed to create a directory: " + location.path); + private: + /// This function cannot assume the filesystem/container already exists. + /// + /// \pre location.container is not empty. + /// \pre location.path is not empty. + template + Status CreateDirTemplate(const ContainerClient& container_client, + CreateDirIfNotExists&& create_if_not_exists, + const AzureLocation& location, bool recursive) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + // Non-recursive CreateDir calls require the parent directory to exist. + if (!recursive) { + auto parent = location.parent(); + if (!parent.path.empty()) { + RETURN_NOT_OK(CheckDirExists(container_client, parent)); } - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to create a directory: ", location.path, - ": ", directory_client.GetUrl()); + // If the parent location is just the container, we don't need to check if it + // exists because the operation we perform below will fail if the container + // doesn't exist and we can handle that error according to the recursive flag. 
} - } - - Status CreateDirRecursive(const AzureLocation& location) { - if (location.container.empty()) { - return Status::Invalid("CreateDir requires a non-empty path."); - } - - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); try { - container_client.CreateIfNotExists(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to create a container: ", location.container, " (", - container_client.GetUrl(), ")"); - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kDisabled) { - // Without hierarchical namespace enabled Azure blob storage has no directories. - // Therefore we can't, and don't need to create one. Simply creating a blob with `/` - // in the name implies directories. + create_if_not_exists(container_client, location); return Status::OK(); - } - // Don't handle HNSSupport::kContainerNotFound, just assume it still exists (because - // it was created above) and try to create the directory. - - if (!location.path.empty()) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - directory_client.CreateIfNotExists(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to create a directory: ", location.path, " (", - directory_client.GetUrl(), ")"); + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + try { + if (recursive) { + container_client.CreateIfNotExists(); + create_if_not_exists(container_client, location); + return Status::OK(); + } else { + auto parent = location.parent(); + return PathNotFound(parent); + } + } catch (const Storage::StorageException& second_exception) { + return ExceptionToStatus(second_exception, "Failed to create directory '", + location.all, "': ", container_client.GetUrl()); + } } + return ExceptionToStatus(exception, "Failed to create directory '", location.all, + "': ", container_client.GetUrl()); } + } - return Status::OK(); + public: + /// This function cannot assume the filesystem already exists. + /// + /// \pre location.container is not empty. + /// \pre location.path is not empty. + Status CreateDirOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location, bool recursive) { + return CreateDirTemplate( + adlfs_client, + [](const auto& adlfs_client, const auto& location) { + auto directory_client = adlfs_client.GetDirectoryClient(location.path); + directory_client.CreateIfNotExists(); + }, + location, recursive); + } + + /// This function cannot assume the container already exists. + /// + /// \pre location.container is not empty. + /// \pre location.path is not empty. 
+ Status CreateDirOnContainer(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location, bool recursive) { + return CreateDirTemplate( + container_client, + [this](const auto& container_client, const auto& location) { + EnsureEmptyDirExistsImplThatThrows(container_client, location.path); + }, + location, recursive); } Result> OpenAppendStream( @@ -1414,10 +1446,92 @@ class AzureFileSystem::Impl { } private: - Status DeleteDirContentsWithoutHierarchicalNamespace(const AzureLocation& location, - bool missing_dir_ok) { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); + void EnsureEmptyDirExistsImplThatThrows( + const Blobs::BlobContainerClient& container_client, + const std::string& path_within_container) { + auto dir_marker_blob_path = internal::EnsureTrailingSlash(path_within_container); + auto block_blob_client = + container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); + // Attach metadata that other filesystem implementations expect to be present + // on directory marker blobs. + // https://github.com/fsspec/adlfs/blob/32132c4094350fca2680155a5c236f2e9f991ba5/adlfs/spec.py#L855-L870 + Blobs::UploadBlockBlobFromOptions blob_options; + blob_options.Metadata.emplace("is_directory", "true"); + block_blob_client.UploadFrom(nullptr, 0, blob_options); + } + + public: + /// This function assumes the container already exists. So it can only be + /// called after that has been verified. + /// + /// \pre location.container is not empty. + /// \pre The location.container container already exists. + Status EnsureEmptyDirExists(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location, const char* operation_name) { + DCHECK(!location.container.empty()); + if (location.path.empty()) { + // Nothing to do. The container already exists per the preconditions. + return Status::OK(); + } + try { + EnsureEmptyDirExistsImplThatThrows(container_client, location.path); + return Status::OK(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, operation_name, " failed to ensure empty directory marker '", + location.path, "' exists in container: ", container_client.GetUrl()); + } + } + + /// \pre location.container is not empty. + /// \pre location.path is empty. + Status DeleteContainer(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + DCHECK(location.path.empty()); + try { + auto response = container_client.Delete(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + container_client.GetUrl(), *response.RawResponse, + "Failed to delete a container: " + location.container); + } + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + return PathNotFound(location); + } + return ExceptionToStatus(exception, + "Failed to delete a container: ", location.container, ": ", + container_client.GetUrl()); + } + } + + /// Deletes contents of a directory and possibly the directory itself + /// depending on the value of preserve_dir_marker_blob. + /// + /// \pre location.container is not empty. + /// \pre preserve_dir_marker_blob=false implies location.path is not empty + /// because we can't *not preserve* the root directory of a container. + /// + /// \param require_dir_to_exist Require the directory to exist *before* this + /// operation, otherwise return PathNotFound. 
+ /// \param preserve_dir_marker_blob Ensure the empty directory marker blob + /// is preserved (not deleted) or created (before the contents are deleted) if it + /// doesn't exist explicitly but is implied by the existence of blobs with names + /// starting with the directory path. + /// \param operation_name Used in error messages to accurately describe the operation + Status DeleteDirContentsOnContainer(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location, + bool require_dir_to_exist, + bool preserve_dir_marker_blob, + const char* operation_name) { + using DeleteBlobResponse = Storage::DeferredResponse; + DCHECK(!location.container.empty()); + DCHECK(preserve_dir_marker_blob || !location.path.empty()) + << "Must pass preserve_dir_marker_blob=true when location.path is empty " + "(i.e. deleting the contents of a container)."; Blobs::ListBlobsOptions options; if (!location.path.empty()) { options.Prefix = internal::EnsureTrailingSlash(location.path); @@ -1428,9 +1542,11 @@ class AzureFileSystem::Impl { // size of the body for a batch request can't exceed 4 MB. const int32_t kNumMaxRequestsInBatch = 256; options.PageSizeHint = kNumMaxRequestsInBatch; + // trusted only if preserve_dir_marker_blob is true. + bool found_dir_marker_blob = false; try { auto list_response = container_client.ListBlobs(options); - if (!missing_dir_ok && list_response.Blobs.empty()) { + if (require_dir_to_exist && list_response.Blobs.empty()) { return PathNotFound(location); } for (; list_response.HasPage(); list_response.MoveToNextPage()) { @@ -1438,20 +1554,44 @@ class AzureFileSystem::Impl { continue; } auto batch = container_client.CreateBatch(); - std::vector> - deferred_responses; + std::vector> deferred_responses; for (const auto& blob_item : list_response.Blobs) { - deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); + if (preserve_dir_marker_blob && !found_dir_marker_blob) { + const bool is_dir_marker_blob = + options.Prefix.HasValue() && blob_item.Name == *options.Prefix; + if (is_dir_marker_blob) { + // Skip deletion of the existing directory marker blob, + // but take note that it exists. + found_dir_marker_blob = true; + continue; + } + } + deferred_responses.emplace_back(blob_item.Name, + batch.DeleteBlob(blob_item.Name)); } try { - container_client.SubmitBatch(batch); + // Before submitting the batch deleting directory contents, ensure + // the empty directory marker blob exists. Doing this first, means that + // directory doesn't "stop existing" during the duration of the batch delete + // operation. + if (preserve_dir_marker_blob && !found_dir_marker_blob) { + // Only create an empty directory marker blob if the directory's + // existence is implied by the existence of blobs with names + // starting with the directory path. 
+ if (!deferred_responses.empty()) { + RETURN_NOT_OK( + EnsureEmptyDirExists(container_client, location, operation_name)); + } + } + if (!deferred_responses.empty()) { + container_client.SubmitBatch(batch); + } } catch (const Storage::StorageException& exception) { return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", location.path, ": ", container_client.GetUrl()); } std::vector failed_blob_names; - for (size_t i = 0; i < deferred_responses.size(); ++i) { - const auto& deferred_response = deferred_responses[i]; + for (auto& [blob_name_view, deferred_response] : deferred_responses) { bool success = true; try { auto delete_result = deferred_response.GetResponse(); @@ -1460,8 +1600,7 @@ class AzureFileSystem::Impl { success = false; } if (!success) { - const auto& blob_item = list_response.Blobs[i]; - failed_blob_names.push_back(blob_item.Name); + failed_blob_names.emplace_back(blob_name_view); } } if (!failed_blob_names.empty()) { @@ -1475,117 +1614,74 @@ class AzureFileSystem::Impl { } } } + return Status::OK(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus(exception, "Failed to list blobs in a directory: ", location.path, ": ", container_client.GetUrl()); } - return Status::OK(); } - public: - Status DeleteDir(const AzureLocation& location) { - if (location.container.empty()) { - return Status::Invalid("DeleteDir requires a non-empty path."); - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return PathNotFound(location); - } - - if (location.path.empty()) { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - try { - auto response = container_client.Delete(); - if (response.Value.Deleted) { - return Status::OK(); - } else { - return StatusFromErrorResponse( - container_client.GetUrl(), *response.RawResponse, - "Failed to delete a container: " + location.container); - } - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to delete a container: ", location.container, - ": ", container_client.GetUrl()); - } - } - - if (hns_support == HNSSupport::kEnabled) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - auto response = directory_client.DeleteRecursive(); - if (response.Value.Deleted) { - return Status::OK(); - } else { - return StatusFromErrorResponse( - directory_client.GetUrl(), *response.RawResponse, - "Failed to delete a directory: " + location.path); - } - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to delete a directory: ", location.path, ": ", - directory_client.GetUrl()); + /// \pre location.container is not empty. + /// \pre location.path is not empty. + Status DeleteDirOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + auto directory_client = adlfs_client.GetDirectoryClient(location.path); + // XXX: should "directory not found" be considered an error? 
+ try { + auto response = directory_client.DeleteRecursive(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse(directory_client.GetUrl(), *response.RawResponse, + "Failed to delete a directory: " + location.path); } - } else { - return DeleteDirContentsWithoutHierarchicalNamespace(location, - /*missing_dir_ok=*/true); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to delete a directory: ", location.path, + ": ", directory_client.GetUrl()); } } - Status DeleteDirContents(const AzureLocation& location, bool missing_dir_ok) { - if (location.container.empty()) { - return internal::InvalidDeleteDirContents(location.all); - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return missing_dir_ok ? Status::OK() : PathNotFound(location); - } - - if (hns_support == HNSSupport::kEnabled) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - auto list_response = directory_client.ListPaths(false); - for (; list_response.HasPage(); list_response.MoveToNextPage()) { - for (const auto& path : list_response.Paths) { - if (path.IsDirectory) { - auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); - try { - sub_directory_client.DeleteRecursive(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "Failed to delete a sub directory: ", location.container, - kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); - } - } else { - auto sub_file_client = adlfs_client.GetFileClient(path.Name); - try { - sub_file_client.Delete(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "Failed to delete a sub file: ", location.container, - kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); - } + /// \pre location.container is not empty. 
+ Status DeleteDirContentsOnFileSystem( + const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location, bool missing_dir_ok) { + auto directory_client = adlfs_client.GetDirectoryClient(location.path); + try { + auto list_response = directory_client.ListPaths(false); + for (; list_response.HasPage(); list_response.MoveToNextPage()) { + for (const auto& path : list_response.Paths) { + if (path.IsDirectory) { + auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); + try { + sub_directory_client.DeleteRecursive(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, "Failed to delete a sub directory: ", location.container, + kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); + } + } else { + auto sub_file_client = adlfs_client.GetFileClient(path.Name); + try { + sub_file_client.Delete(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, "Failed to delete a sub file: ", location.container, + kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); } } } - } catch (const Storage::StorageException& exception) { - if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { - return Status::OK(); - } else { - return ExceptionToStatus(exception, - "Failed to delete directory contents: ", location.path, - ": ", directory_client.GetUrl()); - } } return Status::OK(); - } else { - return DeleteDirContentsWithoutHierarchicalNamespace(location, missing_dir_ok); + } catch (const Storage::StorageException& exception) { + if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { + return Status::OK(); + } + return ExceptionToStatus(exception, + "Failed to delete directory contents: ", location.path, + ": ", directory_client.GetUrl()); } } @@ -1640,7 +1736,30 @@ bool AzureFileSystem::Equals(const FileSystem& other) const { Result AzureFileSystem::GetFileInfo(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->GetFileInfo(location); + if (location.container.empty()) { + DCHECK(location.path.empty()); + // Root directory of the storage account. + return FileInfo{"", FileType::Directory}; + } + if (location.path.empty()) { + // We have a container, but no path within the container. + // The container itself represents a directory. + auto container_client = impl_->GetBlobContainerClient(location.container); + return GetContainerPropsAsFileInfo(location, container_client); + } + // There is a path to search within the container. Check HNS support to proceed. 
+ auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return FileInfo{location.all, FileType::NotFound}; + } + if (hns_support == HNSSupport::kEnabled) { + return impl_->GetFileInfo(adlfs_client, location); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->GetFileInfo(container_client, location); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { @@ -1654,21 +1773,95 @@ Result AzureFileSystem::GetFileInfo(const FileSelector& select) Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - if (recursive) { - return impl_->CreateDirRecursive(location); - } else { - return impl_->CreateDir(location); + if (location.container.empty()) { + return Status::Invalid("CreateDir requires a non-empty path."); } + + auto container_client = impl_->GetBlobContainerClient(location.container); + if (location.path.empty()) { + // If the path is just the container, the parent (root) trivially exists, + // and the CreateDir operation comes down to just creating the container. + return CreateContainerIfNotExists(location.container, container_client); + } + + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + if (!recursive) { + auto parent = location.parent(); + return PathNotFound(parent); + } + RETURN_NOT_OK(CreateContainerIfNotExists(location.container, container_client)); + // Perform a second check for HNS support after creating the container. + ARROW_ASSIGN_OR_RAISE(hns_support, impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + // We only get kContainerNotFound if we are unable to read the properties of the + // container we just created. This is very unlikely, but theoretically possible in + // a concurrent system, so the error is handled to avoid infinite recursion. + return Status::IOError("Unable to read properties of a newly created container: ", + location.container, ": " + container_client.GetUrl()); + } + } + // CreateDirOnFileSystem and CreateDirOnContainer can handle the container + // not existing which is useful and necessary here since the only reason + // a container was created above was to check for HNS support when it wasn't + // cached yet. 
+ if (hns_support == HNSSupport::kEnabled) { + return impl_->CreateDirOnFileSystem(adlfs_client, location, recursive); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + return impl_->CreateDirOnContainer(container_client, location, recursive); } Status AzureFileSystem::DeleteDir(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->DeleteDir(location); + if (location.container.empty()) { + return Status::Invalid("DeleteDir requires a non-empty path."); + } + if (location.path.empty()) { + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->DeleteContainer(container_client, location); + } + + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(location); + } + if (hns_support == HNSSupport::kEnabled) { + return impl_->DeleteDirOnFileSystem(adlfs_client, location); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->DeleteDirContentsOnContainer(container_client, location, + /*require_dir_to_exist=*/true, + /*preserve_dir_marker_blob=*/false, + "DeleteDir"); } Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->DeleteDirContents(location, missing_dir_ok); + if (location.container.empty()) { + return internal::InvalidDeleteDirContents(location.all); + } + + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return missing_dir_ok ? Status::OK() : PathNotFound(location); + } + + if (hns_support == HNSSupport::kEnabled) { + return impl_->DeleteDirContentsOnFileSystem(adlfs_client, location, missing_dir_ok); + } + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->DeleteDirContentsOnContainer(container_client, location, + /*require_dir_to_exist=*/!missing_dir_ok, + /*preserve_dir_marker_blob=*/true, + "DeleteDirContents"); } Status AzureFileSystem::DeleteRootDirContents() { diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index 13d84c9b542b4..5642e16bcfb05 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -71,7 +71,7 @@ enum class HierarchicalNamespaceSupport { /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never /// returned). 
Result CheckIfHierarchicalNamespaceIsEnabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, + const Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, const arrow::fs::AzureOptions& options); } // namespace internal diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index f6af9f722dbac..ff94578b041dc 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -473,6 +473,14 @@ class TestAzureFileSystem : public ::testing::Test { return blob_client; } + Blobs::Models::BlobProperties GetBlobProperties(const std::string& container_name, + const std::string& blob_name) { + return blob_service_client_->GetBlobContainerClient(container_name) + .GetBlobClient(blob_name) + .GetProperties() + .Value; + } + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); @@ -566,86 +574,259 @@ class TestAzureFileSystem : public ::testing::Test { return env->WithHierarchicalNamespace(); } + constexpr static const char* const kSubmitBatchBugMessage = + "This test is affected by an Azurite issue: " + "https://github.com/Azure/Azurite/pull/2302"; + + /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. + /// SubmitBatch is used by: + /// - AzureFileSystem::DeleteDir + /// - AzureFileSystem::DeleteDirContents + bool HasSubmitBatchBug() const { +#ifdef __APPLE__ + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + return env->backend() == AzureBackend::kAzurite; +#else + return false; +#endif + } + // Tests that are called from more than one implementation of TestAzureFileSystem void TestDetectHierarchicalNamespace(bool trip_up_azurite); void TestDetectHierarchicalNamespaceOnMissingContainer(); - void TestGetFileInfoObject(); + + void TestGetFileInfoOfRoot() { + AssertFileInfo(fs(), "", FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); + } + + void TestGetFileInfoOnExistingContainer() { + auto data = SetUpPreexistingData(); + AssertFileInfo(fs(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), data.container_name + "/", FileType::Directory); + auto props = GetBlobProperties(data.container_name, data.kObjectName); + AssertFileInfo(fs(), data.ObjectPath(), FileType::File, + std::chrono::system_clock::time_point{props.LastModified}, + static_cast(props.BlobSize)); + AssertFileInfo(fs(), data.NotFoundObjectPath(), FileType::NotFound); + AssertFileInfo(fs(), data.ObjectPath() + "/", FileType::NotFound); + AssertFileInfo(fs(), data.NotFoundObjectPath() + "/", FileType::NotFound); + + // URIs + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.ObjectPath())); + } + + void TestGetFileInfoOnMissingContainer() { + auto data = SetUpPreexistingData(); + AssertFileInfo(fs(), "nonexistent", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent/object", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent/object/", FileType::NotFound); + } + void TestGetFileInfoObjectWithNestedStructure(); + void TestCreateDirOnRoot() { + auto dir1 = PreexistingData::RandomContainerName(rng_); + auto dir2 = PreexistingData::RandomContainerName(rng_); + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, false)); + 
AssertFileInfo(fs(), dir1, FileType::Directory); + + AssertFileInfo(fs(), dir2, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir2, true)); + AssertFileInfo(fs(), dir1, FileType::Directory); + + // Should not fail if the directory already exists. + ASSERT_OK(fs()->CreateDir(dir1, false)); + ASSERT_OK(fs()->CreateDir(dir1, true)); + AssertFileInfo(fs(), dir1, FileType::Directory); + } + + void TestCreateDirOnExistingContainer() { + auto data = SetUpPreexistingData(); + auto dir1 = data.RandomDirectoryPath(rng_); + auto dir2 = data.RandomDirectoryPath(rng_); + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, /*recursive=*/false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + + AssertFileInfo(fs(), dir2, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir2, /*recursive=*/true)); + AssertFileInfo(fs(), dir2, FileType::Directory); + + auto subdir1 = ConcatAbstractPath(dir1, "subdir"); + auto subdir2 = ConcatAbstractPath(dir2, "subdir"); + AssertFileInfo(fs(), subdir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(subdir1, /*recursive=*/false)); + AssertFileInfo(fs(), subdir1, FileType::Directory); + AssertFileInfo(fs(), subdir2, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(subdir2, /*recursive=*/true)); + AssertFileInfo(fs(), subdir2, FileType::Directory); + + auto dir3 = data.RandomDirectoryPath(rng_); + AssertFileInfo(fs(), dir3, FileType::NotFound); + auto subdir3 = ConcatAbstractPath(dir3, "subdir"); + AssertFileInfo(fs(), subdir3, FileType::NotFound); + // Creating subdir3 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + dir3 + "'"), + fs()->CreateDir(subdir3, /*recursive=*/false)); + AssertFileInfo(fs(), dir3, FileType::NotFound); + AssertFileInfo(fs(), subdir3, FileType::NotFound); + // Creating subdir3 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(subdir3, /*recursive=*/true)); + AssertFileInfo(fs(), dir3, FileType::Directory); + AssertFileInfo(fs(), subdir3, FileType::Directory); + + auto dir4 = data.RandomDirectoryPath(rng_); + auto subdir4 = ConcatAbstractPath(dir4, "subdir4"); + auto subdir5 = ConcatAbstractPath(dir4, "subdir4/subdir5"); + // Creating subdir4 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + dir4 + "'"), + fs()->CreateDir(subdir4, /*recursive=*/false)); + AssertFileInfo(fs(), dir4, FileType::NotFound); + AssertFileInfo(fs(), subdir4, FileType::NotFound); + // Creating subdir5 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + subdir4 + "'"), + fs()->CreateDir(subdir5, /*recursive=*/false)); + AssertFileInfo(fs(), dir4, FileType::NotFound); + AssertFileInfo(fs(), subdir4, FileType::NotFound); + AssertFileInfo(fs(), subdir5, FileType::NotFound); + // Creating subdir5 with recursive=true should work. 
+ ASSERT_OK(fs()->CreateDir(subdir5, /*recursive=*/true)); + AssertFileInfo(fs(), dir4, FileType::Directory); + AssertFileInfo(fs(), subdir4, FileType::Directory); + AssertFileInfo(fs(), subdir5, FileType::Directory); + } + + void TestCreateDirOnMissingContainer() { + auto container1 = PreexistingData::RandomContainerName(rng_); + auto container2 = PreexistingData::RandomContainerName(rng_); + AssertFileInfo(fs(), container1, FileType::NotFound); + AssertFileInfo(fs(), container2, FileType::NotFound); + + auto dir1 = ConcatAbstractPath(container1, "dir"); + AssertFileInfo(fs(), dir1, FileType::NotFound); + // Creating dir1 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + container1 + "'"), + fs()->CreateDir(dir1, /*recursive=*/false)); + AssertFileInfo(fs(), container1, FileType::NotFound); + AssertFileInfo(fs(), dir1, FileType::NotFound); + // Creating dir1 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(dir1, /*recursive=*/true)); + AssertFileInfo(fs(), container1, FileType::Directory); + AssertFileInfo(fs(), dir1, FileType::Directory); + + auto dir2 = ConcatAbstractPath(container2, "dir"); + auto subdir2 = ConcatAbstractPath(dir2, "subdir2"); + auto subdir3 = ConcatAbstractPath(dir2, "subdir2/subdir3"); + // Creating dir2 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + container2 + "'"), + fs()->CreateDir(dir2, /*recursive=*/false)); + AssertFileInfo(fs(), container2, FileType::NotFound); + AssertFileInfo(fs(), dir2, FileType::NotFound); + // Creating subdir2 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + dir2 + "'"), + fs()->CreateDir(subdir2, /*recursive=*/false)); + AssertFileInfo(fs(), container2, FileType::NotFound); + AssertFileInfo(fs(), dir2, FileType::NotFound); + AssertFileInfo(fs(), subdir2, FileType::NotFound); + // Creating subdir3 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + subdir2 + "'"), + fs()->CreateDir(subdir3, /*recursive=*/false)); + AssertFileInfo(fs(), container2, FileType::NotFound); + AssertFileInfo(fs(), dir2, FileType::NotFound); + AssertFileInfo(fs(), subdir2, FileType::NotFound); + AssertFileInfo(fs(), subdir3, FileType::NotFound); + // Creating subdir3 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(subdir3, /*recursive=*/true)); + AssertFileInfo(fs(), container2, FileType::Directory); + AssertFileInfo(fs(), dir2, FileType::Directory); + AssertFileInfo(fs(), subdir2, FileType::Directory); + AssertFileInfo(fs(), subdir3, FileType::Directory); + } + void TestDeleteDirSuccessEmpty() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - if (WithHierarchicalNamespace()) { - ASSERT_OK(fs()->CreateDir(directory_path, true)); - AssertFileInfo(fs(), directory_path, FileType::Directory); - ASSERT_OK(fs()->DeleteDir(directory_path)); - AssertFileInfo(fs(), directory_path, FileType::NotFound); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() and DeleteDir() do nothing. 
- ASSERT_OK(fs()->CreateDir(directory_path)); - AssertFileInfo(fs(), directory_path, FileType::NotFound); - ASSERT_OK(fs()->DeleteDir(directory_path)); - AssertFileInfo(fs(), directory_path, FileType::NotFound); - } + AssertFileInfo(fs(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } - void TestCreateDirSuccessContainerAndDirectory() { + void TestDeleteDirFailureNonexistent() { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs()->CreateDir(path, false)); - if (WithHierarchicalNamespace()) { - AssertFileInfo(fs(), path, FileType::Directory); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - AssertFileInfo(fs(), path, FileType::NotFound); - } + ASSERT_RAISES(IOError, fs()->DeleteDir(path)); } - void TestCreateDirRecursiveSuccessContainerOnly() { - auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs()->CreateDir(container_name, true)); - AssertFileInfo(fs(), container_name, FileType::Directory); + void TestDeleteDirSuccessHaveBlob() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); + ASSERT_OK(output->Write("hello")); + ASSERT_OK(output->Close()); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } - void TestCreateDirRecursiveSuccessDirectoryOnly() { + void TestDeleteDirSuccessHaveDirectory() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); ASSERT_OK(fs()->CreateDir(path, true)); - if (WithHierarchicalNamespace()) { - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - AssertFileInfo(fs(), path, FileType::NotFound); - AssertFileInfo(fs(), parent, FileType::NotFound); - } + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(parent)); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } - void TestCreateDirRecursiveSuccessContainerAndDirectory() { - auto data = SetUpPreexistingData(); - const auto parent = data.RandomDirectoryPath(rng_); - const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs()->CreateDir(path, true)); - if (WithHierarchicalNamespace()) { - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - AssertFileInfo(fs(), data.container_name, FileType::Directory); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. 
- AssertFileInfo(fs(), path, FileType::NotFound); - AssertFileInfo(fs(), parent, FileType::NotFound); - AssertFileInfo(fs(), data.container_name, FileType::Directory); + void TestDeleteDirContentsSuccessExist() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } + auto preexisting_data = SetUpPreexistingData(); + HierarchicalPaths paths; + CreateHierarchicalData(&paths); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); + AssertFileInfo(fs(), paths.directory, FileType::Directory); + for (const auto& sub_path : paths.sub_paths) { + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } void TestDeleteDirContentsSuccessNonexistent() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); ASSERT_OK(fs()->DeleteDirContents(directory_path, true)); @@ -662,7 +843,7 @@ class TestAzureFileSystem : public ::testing::Test { void TestAzureFileSystem::TestDetectHierarchicalNamespace(bool trip_up_azurite) { EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); if (trip_up_azurite && env->backend() != AzureBackend::kAzurite) { - GTEST_SKIP() << "trip_up_azurite=true is only for Azurite."; + return; } auto data = SetUpPreexistingData(); @@ -704,22 +885,6 @@ void TestAzureFileSystem::TestDetectHierarchicalNamespaceOnMissingContainer() { } } -void TestAzureFileSystem::TestGetFileInfoObject() { - auto data = SetUpPreexistingData(); - auto object_properties = - blob_service_client_->GetBlobContainerClient(data.container_name) - .GetBlobClient(data.kObjectName) - .GetProperties() - .Value; - - AssertFileInfo(fs(), data.ObjectPath(), FileType::File, - std::chrono::system_clock::time_point{object_properties.LastModified}, - static_cast(object_properties.BlobSize)); - - // URI - ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); -} - void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { auto data = SetUpPreexistingData(); // Adds detailed tests to handle cases of different edge cases @@ -855,6 +1020,16 @@ TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespaceOnMissingCon this->TestDetectHierarchicalNamespaceOnMissingContainer(); } +TYPED_TEST(TestAzureFileSystemOnAllEnvs, GetFileInfoOfRoot) { + this->TestGetFileInfoOfRoot(); +} + +TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirWithEmptyPath) { + ASSERT_RAISES(Invalid, this->fs()->CreateDir("", false)); +} + +TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirOnRoot) { this->TestCreateDirOnRoot(); } + // Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) // combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and // known according to the environment. 
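Note: the scenario matrix described in the comment above is built with GoogleTest's typed-test machinery (::testing::Types plus TYPED_TEST_SUITE), as the hunk below shows. For reference, a minimal self-contained sketch of that pattern follows; the fixture name and the ScenarioA/B/C tag types are placeholders invented for illustration, not the actual entries of AllScenarios.

#include <gtest/gtest.h>

// Placeholder scenario tags standing in for the real environment/cached-HNS
// combinations enumerated by AllScenarios in azurefs_test.cc.
struct ScenarioA {};
struct ScenarioB {};
struct ScenarioC {};

// Typed fixture: GoogleTest instantiates it once per type in the list.
template <class Scenario>
class ScenarioMatrixSketch : public ::testing::Test {};

using SketchScenarios = ::testing::Types<ScenarioA, ScenarioB, ScenarioC>;
TYPED_TEST_SUITE(ScenarioMatrixSketch, SketchScenarios);

// Each TYPED_TEST body runs once per scenario type, which is how a single
// test body can cover the whole Azurite/flat/HNS matrix.
TYPED_TEST(ScenarioMatrixSketch, RunsOncePerScenario) { SUCCEED(); }

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}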
@@ -869,105 +1044,56 @@ using AllScenarios = ::testing::Types< TYPED_TEST_SUITE(TestAzureFileSystemOnAllScenarios, AllScenarios); -TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObject) { - this->TestGetFileInfoObject(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoOnExistingContainer) { + this->TestGetFileInfoOnExistingContainer(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { - this->TestDeleteDirSuccessEmpty(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoOnMissingContainer) { + this->TestGetFileInfoOnMissingContainer(); } TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObjectWithNestedStructure) { this->TestGetFileInfoObjectWithNestedStructure(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirSuccessContainerAndDirectory) { - this->TestCreateDirSuccessContainerAndDirectory(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnExistingContainer) { + this->TestCreateDirOnExistingContainer(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessContainerOnly) { - this->TestCreateDirRecursiveSuccessContainerOnly(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnMissingContainer) { + this->TestCreateDirOnMissingContainer(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessDirectoryOnly) { - this->TestCreateDirRecursiveSuccessDirectoryOnly(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { + this->TestDeleteDirSuccessEmpty(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, - CreateDirRecursiveSuccessContainerAndDirectory) { - this->TestCreateDirRecursiveSuccessContainerAndDirectory(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirFailureNonexistent) { + this->TestDeleteDirFailureNonexistent(); } -// Tests using a real storage account *with Hierarchical Namespace enabled* - -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirFailureNonexistent) { - auto data = SetUpPreexistingData(); - const auto path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs()->DeleteDir(path)); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveBlob) { + this->TestDeleteDirSuccessHaveBlob(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveBlob) { - auto data = SetUpPreexistingData(); - const auto directory_path = data.RandomDirectoryPath(rng_); - const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); - ASSERT_OK(output->Write(std::string_view("hello"))); - ASSERT_OK(output->Close()); - AssertFileInfo(fs(), blob_path, FileType::File); - ASSERT_OK(fs()->DeleteDir(directory_path)); - AssertFileInfo(fs(), blob_path, FileType::NotFound); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveDirectory) { + this->TestDeleteDirSuccessHaveDirectory(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveDirectory) { - auto data = SetUpPreexistingData(); - const auto parent = data.RandomDirectoryPath(rng_); - const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs()->CreateDir(path, true)); - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - ASSERT_OK(fs()->DeleteDir(parent)); - AssertFileInfo(fs(), path, FileType::NotFound); - AssertFileInfo(fs(), parent, FileType::NotFound); -} - -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessExist) { - auto preexisting_data = SetUpPreexistingData(); - 
HierarchicalPaths paths; - CreateHierarchicalData(&paths); - ASSERT_OK(fs()->DeleteDirContents(paths.directory)); - AssertFileInfo(fs(), paths.directory, FileType::Directory); - for (const auto& sub_path : paths.sub_paths) { - AssertFileInfo(fs(), sub_path, FileType::NotFound); - } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessExist) { + this->TestDeleteDirContentsSuccessExist(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessNonexistent) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessNonexistent) { this->TestDeleteDirContentsSuccessNonexistent(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } // Tests using Azurite (the local Azure emulator) -TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { - AssertFileInfo(fs(), "", FileType::Directory); - - // URI - ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); -} - -TEST_F(TestAzuriteFileSystem, GetFileInfoContainer) { - auto data = SetUpPreexistingData(); - AssertFileInfo(fs(), data.container_name, FileType::Directory); - - AssertFileInfo(fs(), "nonexistent-container", FileType::NotFound); - - // URI - ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); -} - TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { SetUpSmallFileSystemTree(); @@ -1141,16 +1267,6 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); } -TEST_F(TestAzuriteFileSystem, CreateDirFailureNoContainer) { - ASSERT_RAISES(Invalid, fs()->CreateDir("", false)); -} - -TEST_F(TestAzuriteFileSystem, CreateDirSuccessContainerOnly) { - auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs()->CreateDir(container_name, false)); - AssertFileInfo(fs(), container_name, FileType::Directory); -} - TEST_F(TestAzuriteFileSystem, CreateDirFailureDirectoryWithMissingContainer) { const auto path = std::string("not-a-container/new-directory"); ASSERT_RAISES(IOError, fs()->CreateDir(path, false)); @@ -1175,19 +1291,20 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - // There is only virtual directory without hierarchical namespace - // support. So the DeleteDir() for nonexistent directory does nothing. - ASSERT_OK(fs()->DeleteDir(directory_path)); + // DeleteDir() fails if the directory doesn't exist. + ASSERT_RAISES(IOError, fs()->DeleteDir(directory_path)); AssertFileInfo(fs(), directory_path, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { -#ifdef __APPLE__ - GTEST_SKIP() << "This test fails by an Azurite problem: " - "https://github.com/Azure/Azurite/pull/2302"; -#endif + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); // We must use 257 or more blobs here to test pagination of ListBlobs(). 
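Note: the "257 or more" figure follows from the batching constant used by DeleteDirContentsOnContainer above: ListBlobs pages are capped at kNumMaxRequestsInBatch (256) entries, so at least 257 blobs are needed before MoveToNextPage() and a second delete batch are exercised. A small sketch of that arithmetic is below; NumDeletePages is a hypothetical helper written for illustration, not part of the codebase.

#include <cstdint>
#include <iostream>

// Mirrors the page/batch cap used by DeleteDirContentsOnContainer.
constexpr int64_t kNumMaxRequestsInBatch = 256;

// How many ListBlobs pages (and hence delete batches) `num_blobs` blobs need
// when every page is capped at kNumMaxRequestsInBatch entries.
constexpr int64_t NumDeletePages(int64_t num_blobs) {
  return (num_blobs + kNumMaxRequestsInBatch - 1) / kNumMaxRequestsInBatch;
}

static_assert(NumDeletePages(256) == 1, "256 blobs fit in a single page");
static_assert(NumDeletePages(257) == 2, "257 blobs force a second page");

int main() {
  std::cout << NumDeletePages(300) << std::endl;  // prints 2
  return 0;
}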
@@ -1213,10 +1330,9 @@ TEST_F(TestAzuriteFileSystem, DeleteDirUri) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { -#ifdef __APPLE__ - GTEST_SKIP() << "This test fails by an Azurite problem: " - "https://github.com/Azure/Azurite/pull/2302"; -#endif + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); @@ -1229,16 +1345,14 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { -#ifdef __APPLE__ - GTEST_SKIP() << "This test fails by an Azurite problem: " - "https://github.com/Azure/Azurite/pull/2302"; -#endif + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); ASSERT_OK(fs()->DeleteDirContents(paths.directory)); - // GH-38772: We may change this to FileType::Directory. - AssertFileInfo(fs(), paths.directory, FileType::NotFound); + AssertFileInfo(fs(), paths.directory, FileType::Directory); for (const auto& sub_path : paths.sub_paths) { AssertFileInfo(fs(), sub_path, FileType::NotFound); } From 01deb9438acde11f1968acd2a0bb5d3e8e4a4cc6 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 5 Jan 2024 23:44:52 +0800 Subject: [PATCH 136/570] GH-39419: [C++][Parquet] Style: Using arrow::Buffer data_as api rather than reinterpret_cast (#39420) ### Rationale for this change This patch using `{mutable}_data_as()` api to replace `interpret_cast<{const} T*>`. It's just a style fixing. ### What changes are included in this PR? Just api replacement for `::arrow::Buffer` * `reinterpret_cast` -> `mutable_data_as()` * `reinterpret_cast` -> `data_as()` Also, for `auto {variable_name} = reinterpret_cast<{mutable} T*>( ... )`, I changed it to: 1. `const auto*` for `data_as()`. 2. `auto*` for `mutable_data_as()` This didn't change the syntax, but make it more readable. ### Are these changes tested? No need ### Are there any user-facing changes? 
no * Closes: #39419 * Authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/encoding.cc | 74 +++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 840efa12cc3c1..b07ad6c9fb062 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -125,7 +125,7 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { if (valid_bits != NULLPTR) { PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + T* data = buffer->template mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -323,7 +323,7 @@ class PlainEncoder : public EncoderImpl, virtual public BooleanEnco if (valid_bits != NULLPTR) { PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + T* data = buffer->mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -882,7 +882,7 @@ void ByteStreamSplitEncoder::PutSpaced(const T* src, int num_values, if (valid_bits != NULLPTR) { PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + T* data = buffer->template mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -1080,7 +1080,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size ParquetException::EofException(); } for (int i = 0; i < num_values; ++i) { - out[i].ptr = data + i * type_length; + out[i].ptr = data + i * static_cast(type_length); } return static_cast(bytes_to_decode); } @@ -1537,9 +1537,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int Decode(T* buffer, int num_values) override { num_values = std::min(num_values, num_values_); - int decoded_values = - idx_decoder_.GetBatchWithDict(reinterpret_cast(dictionary_->data()), - dictionary_length_, buffer, num_values); + int decoded_values = idx_decoder_.GetBatchWithDict( + dictionary_->data_as(), dictionary_length_, buffer, num_values); if (decoded_values != num_values) { ParquetException::EofException(); } @@ -1551,9 +1550,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int64_t valid_bits_offset) override { num_values = std::min(num_values, num_values_); if (num_values != idx_decoder_.GetBatchWithDictSpaced( - reinterpret_cast(dictionary_->data()), - dictionary_length_, buffer, num_values, null_count, valid_bits, - valid_bits_offset)) { + dictionary_->data_as(), dictionary_length_, buffer, + num_values, null_count, valid_bits, valid_bits_offset)) { ParquetException::EofException(); } num_values_ -= num_values; @@ -1580,8 +1578,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { num_values, /*shrink_to_fit=*/false)); } - auto indices_buffer = - reinterpret_cast(indices_scratch_space_->mutable_data()); + auto indices_buffer = indices_scratch_space_->mutable_data_as(); if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits, valid_bits_offset, indices_buffer)) { @@ 
-1611,8 +1608,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize( num_values, /*shrink_to_fit=*/false)); } - auto indices_buffer = - reinterpret_cast(indices_scratch_space_->mutable_data()); + auto indices_buffer = indices_scratch_space_->mutable_data_as(); if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } @@ -1632,7 +1628,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { void GetDictionary(const T** dictionary, int32_t* dictionary_length) override { *dictionary_length = dictionary_length_; - *dictionary = reinterpret_cast(dictionary_->mutable_data()); + *dictionary = dictionary_->mutable_data_as(); } protected: @@ -1647,8 +1643,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { dictionary_length_ = static_cast(dictionary->values_left()); PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T), /*shrink_to_fit=*/false)); - dictionary->Decode(reinterpret_cast(dictionary_->mutable_data()), - dictionary_length_); + dictionary->Decode(dictionary_->mutable_data_as(), dictionary_length_); } // Only one is set. @@ -1688,7 +1683,7 @@ template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + auto* dict_values = dictionary_->mutable_data_as(); int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { @@ -1702,8 +1697,7 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio int32_t offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); + int32_t* bytes_offsets = byte_array_offsets_->mutable_data_as(); for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; @@ -1717,7 +1711,7 @@ template <> inline void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + auto* dict_values = dictionary_->mutable_data_as(); int fixed_len = descr_->type_length(); int total_size = dictionary_length_ * fixed_len; @@ -1765,7 +1759,7 @@ int DictDecoderImpl::DecodeArrow( typename EncodingTraits::DictAccumulator* builder) { PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, @@ -1801,7 +1795,7 @@ inline int DictDecoderImpl::DecodeArrow( PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, @@ -1834,7 +1828,7 @@ int DictDecoderImpl::DecodeArrow( PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, @@ -1858,7 +1852,7 @@ int DictDecoderImpl::DecodeArrow( PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); using value_type = typename Type::c_type; - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); VisitNullBitmapInline( 
valid_bits, valid_bits_offset, num_values, null_count, @@ -1936,7 +1930,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // space for binary data. RETURN_NOT_OK(helper.Prepare()); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); int values_decoded = 0; int num_indices = 0; int pos_indices = 0; @@ -2007,7 +2001,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // space for binary data. RETURN_NOT_OK(helper.Prepare()); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); while (values_decoded < num_values) { const int32_t batch_size = @@ -2037,7 +2031,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, RETURN_NOT_OK(builder->Reserve(num_values)); ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); int values_decoded = 0; int num_appended = 0; @@ -2090,7 +2084,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, RETURN_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + const auto* dict_values = dictionary_->data_as(); int values_decoded = 0; while (values_decoded < num_values) { @@ -2388,7 +2382,7 @@ void DeltaBitPackEncoder::PutSpaced(const T* src, int num_values, if (valid_bits != NULLPTR) { PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + T* data = buffer->template mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -2734,7 +2728,7 @@ void DeltaLengthByteArrayEncoder::PutSpaced(const T* src, int num_values, if (valid_bits != NULLPTR) { PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + T* data = buffer->template mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -2789,8 +2783,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, } int32_t data_size = 0; - const int32_t* length_ptr = - reinterpret_cast(buffered_length_->data()) + length_idx_; + const int32_t* length_ptr = buffered_length_->data_as() + length_idx_; int bytes_offset = len_ - decoder_->bytes_left(); for (int i = 0; i < max_values; ++i) { int32_t len = length_ptr[i]; @@ -2844,8 +2837,8 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, // call len_decoder_.Decode to decode all the lengths. // all the lengths are buffered in buffered_length_. 
- int ret = len_decoder_.Decode( - reinterpret_cast(buffered_length_->mutable_data()), num_length); + int ret = + len_decoder_.Decode(buffered_length_->mutable_data_as(), num_length); DCHECK_EQ(ret, num_length); length_idx_ = 0; num_valid_values_ = num_length; @@ -2938,7 +2931,7 @@ class RleBooleanEncoder final : public EncoderImpl, virtual public BooleanEncode if (valid_bits != NULLPTR) { PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + T* data = buffer->mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -3136,7 +3129,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
    Resize(num_values * sizeof(T), false)); } - T* data = reinterpret_cast(buffer_->mutable_data()); + T* data = buffer_->mutable_data_as(); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -3338,7 +3331,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode // all the prefix lengths are buffered in buffered_prefix_length_. PARQUET_THROW_NOT_OK(buffered_prefix_length_->Resize(num_prefix * sizeof(int32_t))); int ret = prefix_len_decoder_.Decode( - reinterpret_cast(buffered_prefix_length_->mutable_data()), num_prefix); + buffered_prefix_length_->mutable_data_as(), num_prefix); DCHECK_EQ(ret, num_prefix); prefix_len_offset_ = 0; num_valid_values_ = num_prefix; @@ -3425,8 +3418,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode int64_t data_size = 0; const int32_t* prefix_len_ptr = - reinterpret_cast(buffered_prefix_length_->data()) + - prefix_len_offset_; + buffered_prefix_length_->data_as() + prefix_len_offset_; for (int i = 0; i < max_values; ++i) { if (prefix_len_ptr[i] == 0) { // We don't need to copy the suffix if the prefix length is 0. @@ -3578,7 +3570,7 @@ class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecodersize() < size) { PARQUET_ASSIGN_OR_THROW(decode_buffer_, ::arrow::AllocateBuffer(size)); } - return reinterpret_cast(decode_buffer_->mutable_data()); + return decode_buffer_->mutable_data_as(); } private: From b736c99cea9e6b86475e8f2ce264ede3262a237c Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 5 Jan 2024 12:54:42 -0500 Subject: [PATCH 137/570] GH-39468: [Java] Fix site build for docs (#39471) ### Rationale for this change Pin plugins we use for docs build. ### Are there any user-facing changes? No. 
* Closes: #39468 Authored-by: David Li Signed-off-by: David Li --- java/bom/pom.xml | 30 ++++++++++++++++++++++++++++++ java/maven/pom.xml | 37 +++++++++++++++++++++++++++++++++++++ java/performance/pom.xml | 4 ---- java/pom.xml | 30 ++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 4 deletions(-) diff --git a/java/bom/pom.xml b/java/bom/pom.xml index 1f6f854f60013..5c2ed33dadddf 100644 --- a/java/bom/pom.xml +++ b/java/bom/pom.xml @@ -145,4 +145,34 @@ + + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.5.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + + + + + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.5.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + + + diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 0923984c8e5e5..56f3c4c434f64 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -302,7 +302,44 @@ + + + org.apache.maven.plugins + maven-project-info-reports-plugin + + + org.apache.maven.plugins + maven-site-plugin + + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.5.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + + + + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + + + diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 4d449af46b6b1..13300c2ac834f 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -201,10 +201,6 @@ maven-resources-plugin 3.3.1 - - maven-site-plugin - 3.3 - maven-source-plugin 2.2.1 diff --git a/java/pom.xml b/java/pom.xml index fae072018eb19..6b7192fd33efc 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -390,6 +390,16 @@ + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + @@ -572,6 +582,16 @@ module-info-compiler-maven-plugin ${project.version} + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + @@ -757,6 +777,16 @@ + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + From 38af25808e7826fb64265a78b2ed36b3882499f9 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 5 Jan 2024 17:57:06 +0000 Subject: [PATCH 138/570] GH-39048: [JS] Re-export existing type enums (#39473) This way we don't have to manually maintain the enums even though they should be pretty much constants. * Closes: #39048 --- js/src/enum.ts | 120 +++---------------------------------------------- 1 file changed, 7 insertions(+), 113 deletions(-) diff --git a/js/src/enum.ts b/js/src/enum.ts index 0eecc0c68b525..e4284e42774ad 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -15,119 +15,13 @@ // specific language governing permissions and limitations // under the License. -//// -// -// A few enums copied from `fb/Schema.ts` and `fb/Message.ts` because Webpack -// v4 doesn't seem to be able to tree-shake the rest of those exports. -// -// We will have to keep these enums in sync when we re-generate the flatbuffers -// code from the schemas. See js/DEVELOP.md for info on how to run flatbuffers -// code generation. -// -//// - -/** - * Logical types, vector layouts, and schemas - * - * @enum {number} - */ -export enum MetadataVersion { - /** - * 0.1.0 (October 2016). - */ - V1 = 0, - - /** - * 0.2.0 (February 2017). Non-backwards compatible with V1. 
- */ - V2 = 1, - - /** - * 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. - */ - V3 = 2, - - /** - * >= 0.8.0 (December 2017). Non-backwards compatible with V3. - */ - V4 = 3, - - /** - * >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 - * metadata and IPC messages). Implementations are recommended to provide a - * V4 compatibility mode with V5 format changes disabled. - * - * Incompatible changes between V4 and V5: - * - Union buffer layout has changed. In V5, Unions don't have a validity - * bitmap buffer. - */ - V5 = 4 -} - -/** - * @enum {number} - */ -export enum UnionMode { - Sparse = 0, - Dense = 1 -} - -/** - * @enum {number} - */ -export enum Precision { - HALF = 0, - SINGLE = 1, - DOUBLE = 2 -} - -/** - * @enum {number} - */ -export enum DateUnit { - DAY = 0, - MILLISECOND = 1 -} - -/** - * @enum {number} - */ -export enum TimeUnit { - SECOND = 0, - MILLISECOND = 1, - MICROSECOND = 2, - NANOSECOND = 3 -} - -/** - * @enum {number} - */ -export enum IntervalUnit { - YEAR_MONTH = 0, - DAY_TIME = 1, - MONTH_DAY_NANO = 2 -} - -/** - * ---------------------------------------------------------------------- - * The root Message type - * This union enables us to easily send different message types without - * redundant storage, and in the future we can easily add new message types. - * - * Arrow implementations do not need to implement all of the message types, - * which may include experimental metadata types. For maximum compatibility, - * it is best to send data using RecordBatch - * - * @enum {number} - */ -export enum MessageHeader { - NONE = 0, - Schema = 1, - DictionaryBatch = 2, - RecordBatch = 3, - Tensor = 4, - SparseTensor = 5 -} +export { MetadataVersion } from './fb/metadata-version.js'; +export { UnionMode } from './fb/union-mode.js'; +export { Precision } from './fb/precision.js'; +export { DateUnit } from './fb/date-unit.js'; +export { TimeUnit } from './fb/time-unit.js'; +export { IntervalUnit } from './fb/interval-unit.js'; +export { MessageHeader } from './fb/message-header.js'; /** * Main data type enumeration. From 9b931af14e5a710cba0aaa6b899e2ca696bfd785 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 5 Jan 2024 19:35:36 +0000 Subject: [PATCH 139/570] GH-39477: [JS] remove esModuleInterop (#39478) * Closes: #39477 Also removes a dependency for pad left since it's built in now: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/padStart. 
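In practice the change has two visible effects: with `esModuleInterop` off, default imports of CommonJS packages become namespace imports (e.g. `import * as commandLineArgs from 'command-line-args'`), and the `pad-left` and `randomatic` dependencies are replaced by the built-in `String.prototype.padStart` and a small local `randomString` test helper. A minimal sketch of both replacements is below; it is illustrative only, and `formatCell` is a hypothetical name rather than part of this patch.

```ts
// Sketch of the two dependency replacements in this patch (assumed shapes, not the exact code).

// pad-left's padLeft(value, width) becomes the built-in padStart:
export function formatCell(value: string, width: number): string {
  return value.padStart(width);
}

// randomatic('?', length, { chars }) becomes a local helper like js/test/random-string.ts:
export function randomString(
  length: number,
  chars = 'abcdefghijklmnopqrstuvwxyz0123456789_'
): string {
  let result = '';
  while (length--) {
    result += chars.charAt(Math.floor(Math.random() * chars.length));
  }
  return result;
}

// Example usage:
//   formatCell('42', 5)  -> '   42'
//   randomString(8)      -> e.g. 'k3v_9xq0'
```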
--- js/package.json | 5 +- js/src/bin/arrow2csv.ts | 9 ++- js/test/generate-test-data.ts | 5 +- js/test/random-string.ts | 43 ++++++++++++ js/test/unit/builders/utils.ts | 5 +- js/test/unit/ipc/helpers.ts | 8 +-- js/test/unit/ipc/reader/streams-node-tests.ts | 3 - js/test/unit/utils.ts | 2 +- js/tsconfig.json | 1 - js/tsconfig/tsconfig.base.json | 1 - js/yarn.lock | 68 +++++++++---------- 11 files changed, 89 insertions(+), 61 deletions(-) create mode 100644 js/test/random-string.ts diff --git a/js/package.json b/js/package.json index d72fdd3177016..eb24947ce78b8 100644 --- a/js/package.json +++ b/js/package.json @@ -56,12 +56,10 @@ "@types/command-line-args": "^5.2.1", "@types/command-line-usage": "^5.0.2", "@types/node": "^20.6.0", - "@types/pad-left": "^2.1.1", "command-line-args": "^5.2.1", "command-line-usage": "^7.0.1", "flatbuffers": "^23.5.26", "json-bignum": "^0.0.3", - "pad-left": "^2.1.0", "tslib": "^2.6.2" }, "devDependencies": { @@ -73,7 +71,7 @@ "@types/benchmark": "2.1.4", "@types/glob": "8.1.0", "@types/jest": "29.5.3", - "@types/randomatic": "3.1.3", + "@types/multistream": "4.1.3", "@typescript-eslint/eslint-plugin": "5.62.0", "@typescript-eslint/parser": "5.59.9", "async-done": "2.0.0", @@ -104,7 +102,6 @@ "memfs": "4.5.0", "mkdirp": "3.0.1", "multistream": "4.1.0", - "randomatic": "3.1.1", "regenerator-runtime": "0.14.0", "rollup": "4.3.0", "rxjs": "7.8.1", diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 39db8c17497cd..4115f30099f03 100755 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -23,9 +23,8 @@ import * as fs from 'fs'; import * as stream from 'stream'; import { Schema, RecordBatch, RecordBatchReader, AsyncByteQueue, util } from '../Arrow.js'; -import commandLineUsage from 'command-line-usage'; -import commandLineArgs from 'command-line-args'; -import padLeft from 'pad-left'; +import * as commandLineUsage from 'command-line-usage'; +import * as commandLineArgs from 'command-line-args'; // @ts-ignore import { parse as bignumJSONParse } from 'json-bignum'; @@ -190,11 +189,11 @@ function batchesToString(state: ToStringState, schema: Schema) { } function horizontalRule(maxColWidths: number[], hr = '', sep = ' | ') { - return ` ${padLeft('', maxColWidths.reduce((x, y) => x + y, -2 + maxColWidths.length * sep.length), hr)}`; + return ` ${''.padStart(maxColWidths.reduce((x, y) => x + y, -2 + maxColWidths.length * sep.length), hr)}`; } function formatRow(row: string[] = [], maxColWidths: number[] = [], sep = ' | ') { - return `${row.map((x, j) => padLeft(x, maxColWidths[j])).join(sep)}`; + return row.map((x, j) => x.padStart(maxColWidths[j])).join(sep); } function formatMetadataValue(value = '') { diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index be248ad2c6ed8..8e6e47de836eb 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-import randomatic from 'randomatic'; - import { makeData, Vector, Visitor, DataType, TypeMap, Table, Schema, Field, RecordBatch, @@ -43,6 +41,8 @@ import { util } from 'apache-arrow'; +import { randomString } from './random-string.js'; + type TKeys = Int8 | Int16 | Int32 | Uint8 | Uint16 | Uint32; interface TestDataVectorGenerator extends Visitor { @@ -650,7 +650,6 @@ type TypedArrayConstructor = const rand = Math.random.bind(Math); const randomBytes = (length: number) => fillRandom(Uint8Array, length); -const randomString = (length: number) => randomatic('?', length, { chars: `abcdefghijklmnopqrstuvwxyz0123456789_` }); const memoize = (fn: () => any) => ((x?: any) => () => x || (x = fn()))(); diff --git a/js/test/random-string.ts b/js/test/random-string.ts new file mode 100644 index 0000000000000..a70af451d4220 --- /dev/null +++ b/js/test/random-string.ts @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +export const LOWER = 'abcdefghijklmnopqrstuvwxyz'; +export const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; +export const NUMBER = '0123456789'; +export const SPECIAL = '~!@#$%^&()_+-={}[];\',.'; + +export const ALL = LOWER + UPPER + NUMBER + SPECIAL; + +/** + * Generate random string of specified `length` for the given `pattern`. + * + * @param `pattern` The pattern to use for generating the random string. + * @param `length` The length of the string to generate. + * @param `options` + */ +export function randomString(length: number, characters: string = `${LOWER + NUMBER}_`) { + let result = ''; + + while (length--) { + result += characters.charAt(Math.floor(Math.random() * characters.length)); + } + return result; +} + + +10; + diff --git a/js/test/unit/builders/utils.ts b/js/test/unit/builders/utils.ts index fbd8eb49eee7e..db4e80d002778 100644 --- a/js/test/unit/builders/utils.ts +++ b/js/test/unit/builders/utils.ts @@ -20,9 +20,9 @@ import 'web-streams-polyfill'; import { from, fromDOMStream, toArray } from 'ix/asynciterable'; import { fromNodeStream } from 'ix/asynciterable/fromnodestream'; import 'ix/Ix.node'; -import randstr from 'randomatic'; import '../../jest-extensions.js'; +import { randomString } from '../../random-string.js'; import { Builder, makeBuilder, builderThroughIterable, DataType, util, Vector } from 'apache-arrow'; @@ -30,9 +30,6 @@ const rand = Math.random.bind(Math); const randnulls = (values: T[], n: TNull = null) => values.map((x) => Math.random() > 0.25 ? 
x : n) as (T | TNull)[]; export const randomBytes = (length: number) => fillRandom(Uint8Array, length); -export const randomString = ((opts) => (length: number) => - randstr('?', length, opts) -)({ chars: `abcdefghijklmnopqrstuvwxyz0123456789_` }); export const stringsNoNulls = (length = 20) => Array.from({ length }, (_) => randomString(1 + (Math.trunc(Math.random() * 19)))); export const timestamp32sNoNulls = (length = 20, now = Math.trunc(Date.now() / 86400000)) => diff --git a/js/test/unit/ipc/helpers.ts b/js/test/unit/ipc/helpers.ts index a09cd01799a5f..2a228aa7abf18 100644 --- a/js/test/unit/ipc/helpers.ts +++ b/js/test/unit/ipc/helpers.ts @@ -15,11 +15,8 @@ // specific language governing permissions and limitations // under the License. -import '../../jest-extensions.js'; - import * as fs from 'fs'; import { fs as memfs } from 'memfs'; -import randomatic from 'randomatic'; import { PassThrough, Readable } from 'stream'; import { @@ -30,6 +27,9 @@ import { Table } from 'apache-arrow'; +import '../../jest-extensions.js'; +import { LOWER, NUMBER, randomString } from '../../random-string.js'; + export abstract class ArrowIOTestHelper { constructor(public table: Table) { } @@ -40,7 +40,7 @@ export abstract class ArrowIOTestHelper { protected abstract writer(table: Table): RecordBatchWriter; protected async filepath(table: Table): Promise { - const path = `/${randomatic('a0', 20)}.arrow`; + const path = `/${randomString(20, LOWER + NUMBER)}.arrow`; const data = await this.writer(table).toUint8Array(); await memfs.promises.writeFile(path, data); return path; diff --git a/js/test/unit/ipc/reader/streams-node-tests.ts b/js/test/unit/ipc/reader/streams-node-tests.ts index 24dd92fb5712a..2e3f08c4e7837 100644 --- a/js/test/unit/ipc/reader/streams-node-tests.ts +++ b/js/test/unit/ipc/reader/streams-node-tests.ts @@ -100,7 +100,6 @@ import { } it('readAll() should pipe to separate NodeJS WritableStreams', async () => { - // @ts-ignore const { default: MultiStream } = await import('multistream'); const { PassThrough } = await import('stream'); @@ -138,7 +137,6 @@ import { }); it('should not close the underlying NodeJS ReadableStream when reading multiple tables to completion', async () => { - // @ts-ignore const { default: MultiStream } = await import('multistream'); expect.hasAssertions(); @@ -168,7 +166,6 @@ import { }); it('should close the underlying NodeJS ReadableStream when reading multiple tables and we break early', async () => { - // @ts-ignore const { default: MultiStream } = await import('multistream'); expect.hasAssertions(); diff --git a/js/test/unit/utils.ts b/js/test/unit/utils.ts index c57de487f9edb..8f0a99c4a8616 100644 --- a/js/test/unit/utils.ts +++ b/js/test/unit/utils.ts @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-export function arange(arr: T, n = arr.length) { +export function arange(arr: T, n = arr.length) { for (let i = -1; ++i < n; arr[i] = i) { } return arr; } diff --git a/js/tsconfig.json b/js/tsconfig.json index abdd1815a0d98..96f457b50fb82 100644 --- a/js/tsconfig.json +++ b/js/tsconfig.json @@ -9,7 +9,6 @@ "module": "ESNext", "isolatedModules": true, "noEmit": true, - "esModuleInterop": true, "baseUrl": "./", "rootDir": "./", "paths": { diff --git a/js/tsconfig/tsconfig.base.json b/js/tsconfig/tsconfig.base.json index 0d7fefd90949f..874ea9f52f0d8 100644 --- a/js/tsconfig/tsconfig.base.json +++ b/js/tsconfig/tsconfig.base.json @@ -12,7 +12,6 @@ /* Basic stuff */ "moduleResolution": "Node", "lib": ["DOM", "ESNext", "ESNext.AsyncIterable"], - "esModuleInterop": true, /* Control what is emitted */ "declaration": true, diff --git a/js/yarn.lock b/js/yarn.lock index bf22cce197c6b..cef6357e02e44 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -65,6 +65,14 @@ "@babel/highlight" "^7.22.13" chalk "^2.4.2" +"@babel/code-frame@^7.22.5": + version "7.23.5" + resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.23.5.tgz#9009b69a8c602293476ad598ff53e4562e15c244" + integrity sha512-CgH3s1a96LipHCmSUmYFPwY7MNx8C3avkq7i4Wl3cfa662ldtUe4VM1TPXX70pfmrlWTb6jLqTYrZyT2ZTJBgA== + dependencies: + "@babel/highlight" "^7.23.4" + chalk "^2.4.2" + "@babel/compat-data@^7.22.9": version "7.23.2" resolved "https://registry.yarnpkg.com/@babel/compat-data/-/compat-data-7.23.2.tgz#6a12ced93455827037bfb5ed8492820d60fc32cc" @@ -217,6 +225,15 @@ chalk "^2.4.2" js-tokens "^4.0.0" +"@babel/highlight@^7.23.4": + version "7.23.4" + resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.23.4.tgz#edaadf4d8232e1a961432db785091207ead0621b" + integrity sha512-acGdbYSfp2WheJoJm/EBBBLh/ID8KDc64ISZ9DYtBmC8/Q204PZJLHyzeB5qMzJ5trcOkybd78M4x2KWsUq++A== + dependencies: + "@babel/helper-validator-identifier" "^7.22.20" + chalk "^2.4.2" + js-tokens "^4.0.0" + "@babel/parser@^7.1.0", "@babel/parser@^7.14.7", "@babel/parser@^7.20.7", "@babel/parser@^7.22.16": version "7.22.16" resolved "https://registry.npmjs.org/@babel/parser/-/parser-7.22.16.tgz#180aead7f247305cce6551bea2720934e2fa2c95" @@ -227,6 +244,11 @@ resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.0.tgz#da950e622420bf96ca0d0f2909cdddac3acd8719" integrity sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw== +"@babel/parser@^7.22.5": + version "7.23.6" + resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.6.tgz#ba1c9e512bda72a47e285ae42aff9d2a635a9e3b" + integrity sha512-Z2uID7YJ7oNvAI20O9X0bblw7Qqs8Q2hFy0R9tAfnfLkp5MW0UH9eUvnDSnFwKZ0AvgS1ucqR4KzvVHgnke1VQ== + "@babel/plugin-syntax-async-generators@^7.8.4": version "7.8.4" resolved "https://registry.npmjs.org/@babel/plugin-syntax-async-generators/-/plugin-syntax-async-generators-7.8.4.tgz#a983fb1aeb2ec3f6ed042a210f640e90e786fe0d" @@ -339,9 +361,9 @@ resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.22.5.tgz#0c8c4d944509875849bd0344ff0050756eefc6ec" integrity sha512-X7yV7eiwAxdj9k94NEylvbVHLiVG1nvzCV2EAowhxLTwODV1jl9UzZ48leOC0sH7OnuHrIkllaBgneUykIcZaw== dependencies: - "@babel/code-frame" "^7.22.13" - "@babel/parser" "^7.22.15" - "@babel/types" "^7.22.15" + "@babel/code-frame" "^7.22.5" + "@babel/parser" "^7.22.5" + "@babel/types" "^7.22.5" "@babel/traverse@^7.22.15", "@babel/traverse@^7.22.19": version "7.23.2" @@ -1342,6 +1364,13 @@ resolved 
"https://registry.npmjs.org/@types/minimist/-/minimist-1.2.2.tgz#ee771e2ba4b3dc5b372935d549fd9617bf345b8c" integrity sha512-jhuKLIRrhvCPLqwPcx6INqmKeiA5EWrsCOPhrlFSrbrmU4ZMPjj5Ul/oLCMDO98XRUIwVm78xICz4EPCektzeQ== +"@types/multistream@4.1.3": + version "4.1.3" + resolved "https://registry.yarnpkg.com/@types/multistream/-/multistream-4.1.3.tgz#972e3666502128dc273ef15c86b2e533e373ece4" + integrity sha512-t57vmDEJOZuC0M3IrZYfCd9wolTcr3ZTCGk1iwHNosvgBX+7/SMvCGcR8wP9lidpelBZQ12crSuINOxkk0azPA== + dependencies: + "@types/node" "*" + "@types/node@*", "@types/node@^20.6.0": version "20.8.10" resolved "https://registry.yarnpkg.com/@types/node/-/node-20.8.10.tgz#a5448b895c753ae929c26ce85cab557c6d4a365e" @@ -1359,16 +1388,6 @@ resolved "https://registry.npmjs.org/@types/normalize-package-data/-/normalize-package-data-2.4.1.tgz#d3357479a0fdfdd5907fe67e17e0a85c906e1301" integrity sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw== -"@types/pad-left@^2.1.1": - version "2.1.3" - resolved "https://registry.yarnpkg.com/@types/pad-left/-/pad-left-2.1.3.tgz#f636e62154e95bf6660439c51fe828da918124b2" - integrity sha512-fayws3T8lGvIY3UEtqFHKSH6FS1Lepo6kd3ZTgdj8rsVIIwzr9MZJt1ZP9UGu+cdAZsJiG2d5iYxyhRXwtUB5A== - -"@types/randomatic@3.1.3": - version "3.1.3" - resolved "https://registry.npmjs.org/@types/randomatic/-/randomatic-3.1.3.tgz#5475c29e82cb8dab6c94e55e77306c8eedab2d1f" - integrity sha512-UlYMg/XxN+YMh6vAiB879yh2bhaTOU0DB1g4NGIhzlaiSf22rAVKIGTvH8HjCXu+wfFvjAWHuPG5waN4btEubw== - "@types/resolve@1.20.2": version "1.20.2" resolved "https://registry.npmjs.org/@types/resolve/-/resolve-1.20.2.tgz#97d26e00cd4a0423b4af620abecf3e6f442b7975" @@ -5136,11 +5155,6 @@ matchdep@^2.0.0: resolve "^1.4.0" stack-trace "0.0.10" -math-random@^1.0.1: - version "1.0.4" - resolved "https://registry.npmjs.org/math-random/-/math-random-1.0.4.tgz#5dd6943c938548267016d4e34f057583080c514c" - integrity sha512-rUxjysqif/BZQH2yhd5Aaq7vXMSx9NdEsQcyA07uEzIvxgI7zIr33gGsh+RU0/XjmQpCW7RsVof1vlkvQVCK5A== - memfs@4.5.0: version "4.5.0" resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.5.0.tgz#03082709987760022275e0d3bc0f24545b7fe279" @@ -5584,13 +5598,6 @@ p-try@^2.0.0: resolved "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6" integrity sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ== -pad-left@^2.1.0: - version "2.1.0" - resolved "https://registry.npmjs.org/pad-left/-/pad-left-2.1.0.tgz#16e6a3b2d44a8e138cb0838cc7cb403a4fc9e994" - integrity sha512-HJxs9K9AztdIQIAIa/OIazRAUW/L6B9hbQDxO4X07roW3eo9XqZc2ur9bn1StH9CnbbI9EgvejHQX7CBpCF1QA== - dependencies: - repeat-string "^1.5.4" - parent-module@^1.0.0: version "1.0.1" resolved "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz#691d2709e78c79fae3a156622452d00762caaaa2" @@ -5875,15 +5882,6 @@ quick-lru@^5.1.1: resolved "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz#366493e6b3e42a3a6885e2e99d18f80fb7a8c932" integrity sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA== -randomatic@3.1.1: - version "3.1.1" - resolved "https://registry.npmjs.org/randomatic/-/randomatic-3.1.1.tgz#b776efc59375984e36c537b2f51a1f0aff0da1ed" - integrity sha512-TuDE5KxZ0J461RVjrJZCJc+J+zCkTb1MbH9AQUq68sMhOMcy9jLcb3BrZKgp9q9Ncltdg4QVqWrH02W2EFFVYw== - dependencies: - is-number "^4.0.0" - kind-of "^6.0.0" - math-random "^1.0.1" - randombytes@^2.1.0: version "2.1.0" resolved 
"https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz#df6f84372f0270dc65cdf6291349ab7a473d4f2a" @@ -6049,7 +6047,7 @@ repeat-element@^1.1.2: resolved "https://registry.npmjs.org/repeat-element/-/repeat-element-1.1.4.tgz#be681520847ab58c7568ac75fbfad28ed42d39e9" integrity sha512-LFiNfRcSu7KK3evMyYOuCzv3L10TW7yC1G2/+StMjK8Y6Vqd2MG7r/Qjw4ghtuCOjFvlnms/iMmLqpvW/ES/WQ== -repeat-string@^1.5.4, repeat-string@^1.6.1: +repeat-string@^1.6.1: version "1.6.1" resolved "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz#8dcae470e1c88abc2d600fff4a776286da75e637" integrity sha512-PV0dzCYDNfRi1jCDbJzpW7jNNDRuCOG/jI5ctQcGKt/clZD+YcPS3yIlWuTJMmESC8aevCFmWJy5wjAFgNqN6w== From afb40a9f5a33802897e1d5bae8305c81da7beee1 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 5 Jan 2024 19:36:43 +0000 Subject: [PATCH 140/570] GH-39259: [JS] Remove getByteLength (#39260) * Closes: #39259 --- js/src/recordbatch.ts | 9 -- js/src/table.ts | 9 -- js/src/vector.ts | 17 +--- js/src/visitor/bytelength.ts | 164 ----------------------------------- js/test/unit/table-tests.ts | 7 -- 5 files changed, 3 insertions(+), 203 deletions(-) delete mode 100644 js/src/visitor/bytelength.ts diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts index 1ea7c52ccf310..b9061c8b9bb04 100644 --- a/js/src/recordbatch.ts +++ b/js/src/recordbatch.ts @@ -25,7 +25,6 @@ import { instance as getVisitor } from './visitor/get.js'; import { instance as setVisitor } from './visitor/set.js'; import { instance as indexOfVisitor } from './visitor/indexof.js'; import { instance as iteratorVisitor } from './visitor/iterator.js'; -import { instance as byteLengthVisitor } from './visitor/bytelength.js'; /** @ignore */ export interface RecordBatch { @@ -150,14 +149,6 @@ export class RecordBatch { return indexOfVisitor.visit(this.data, element, offset); } - /** - * Get the size (in bytes) of a row by index. - * @param index The row index for which to compute the byteLength. - */ - public getByteLength(index: number): number { - return byteLengthVisitor.visit(this.data, index); - } - /** * Iterator for rows in this RecordBatch. */ diff --git a/js/src/table.ts b/js/src/table.ts index e719b7ca9d313..d7a6617530a8e 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -38,7 +38,6 @@ import { instance as getVisitor } from './visitor/get.js'; import { instance as setVisitor } from './visitor/set.js'; import { instance as indexOfVisitor } from './visitor/indexof.js'; import { instance as iteratorVisitor } from './visitor/iterator.js'; -import { instance as byteLengthVisitor } from './visitor/bytelength.js'; import { DataProps } from './data.js'; import { clampRange } from './util/vector.js'; @@ -215,13 +214,6 @@ export class Table { // @ts-ignore public indexOf(element: Struct['TValue'], offset?: number): number { return -1; } - /** - * Get the size in bytes of an element by index. - * @param index The index at which to get the byteLength. - */ - // @ts-ignore - public getByteLength(index: number): number { return 0; } - /** * Iterator for rows in this Table. 
*/ @@ -390,7 +382,6 @@ export class Table { (proto as any)['get'] = wrapChunkedCall1(getVisitor.getVisitFn(Type.Struct)); (proto as any)['set'] = wrapChunkedCall2(setVisitor.getVisitFn(Type.Struct)); (proto as any)['indexOf'] = wrapChunkedIndexOf(indexOfVisitor.getVisitFn(Type.Struct)); - (proto as any)['getByteLength'] = wrapChunkedCall1(byteLengthVisitor.getVisitFn(Type.Struct)); return 'Table'; })(Table.prototype); } diff --git a/js/src/vector.ts b/js/src/vector.ts index 8b94b14e3fff7..a7c103bc326ee 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -36,7 +36,6 @@ import { instance as getVisitor } from './visitor/get.js'; import { instance as setVisitor } from './visitor/set.js'; import { instance as indexOfVisitor } from './visitor/indexof.js'; import { instance as iteratorVisitor } from './visitor/iterator.js'; -import { instance as byteLengthVisitor } from './visitor/bytelength.js'; // @ts-ignore import type { vectorFromArray } from './factories.js'; @@ -56,7 +55,7 @@ export interface Vector { [Symbol.isConcatSpreadable]: true; } -const visitorsByTypeId = {} as { [typeId: number]: { get: any; set: any; indexOf: any; byteLength: any } }; +const visitorsByTypeId = {} as { [typeId: number]: { get: any; set: any; indexOf: any } }; const vectorPrototypesByTypeId = {} as { [typeId: number]: any }; /** @@ -76,14 +75,13 @@ export class Vector { case 0: this._offsets = [0]; break; case 1: { // special case for unchunked vectors - const { get, set, indexOf, byteLength } = visitorsByTypeId[type.typeId]; + const { get, set, indexOf } = visitorsByTypeId[type.typeId]; const unchunkedData = data[0]; this.isValid = (index: number) => isChunkedValid(unchunkedData, index); this.get = (index: number) => get(unchunkedData, index); this.set = (index: number, value: T) => set(unchunkedData, index, value); this.indexOf = (index: number) => indexOf(unchunkedData, index); - this.getByteLength = (index: number) => byteLength(unchunkedData, index); this._offsets = [0, unchunkedData.length]; break; } @@ -200,13 +198,6 @@ export class Vector { return this.indexOf(element, offset) > -1; } - /** - * Get the size in bytes of an element by index. - * @param index The index at which to get the byteLength. - */ - // @ts-ignore - public getByteLength(index: number): number { return 0; } - /** * Iterator for the Vector's elements. */ @@ -366,15 +357,13 @@ export class Vector { const get = getVisitor.getVisitFnByTypeId(typeId); const set = setVisitor.getVisitFnByTypeId(typeId); const indexOf = indexOfVisitor.getVisitFnByTypeId(typeId); - const byteLength = byteLengthVisitor.getVisitFnByTypeId(typeId); - visitorsByTypeId[typeId] = { get, set, indexOf, byteLength }; + visitorsByTypeId[typeId] = { get, set, indexOf }; vectorPrototypesByTypeId[typeId] = Object.create(proto, { ['isValid']: { value: wrapChunkedCall1(isChunkedValid) }, ['get']: { value: wrapChunkedCall1(getVisitor.getVisitFnByTypeId(typeId)) }, ['set']: { value: wrapChunkedCall2(setVisitor.getVisitFnByTypeId(typeId)) }, ['indexOf']: { value: wrapChunkedIndexOf(indexOfVisitor.getVisitFnByTypeId(typeId)) }, - ['getByteLength']: { value: wrapChunkedCall1(byteLengthVisitor.getVisitFnByTypeId(typeId)) }, }); } diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts deleted file mode 100644 index 43399b2571fe2..0000000000000 --- a/js/src/visitor/bytelength.ts +++ /dev/null @@ -1,164 +0,0 @@ -/* istanbul ignore file */ - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/* eslint-disable unicorn/no-array-callback-reference */ - -import { Data } from '../data.js'; -import { Visitor } from '../visitor.js'; -import { TypeToDataType } from '../interfaces.js'; -import { Type, TimeUnit, UnionMode } from '../enum.js'; -import { - DataType, Dictionary, - Float, Int, Date_, Interval, Time, Timestamp, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, - List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, -} from '../type.js'; -import { bigIntToNumber } from '../util/bigint.js'; - -/** @ignore */ const sum = (x: number, y: number) => x + y; - -/** @ignore */ -export interface GetByteLengthVisitor extends Visitor { - visit(node: Data, index: number): number; - visitMany(nodes: Data[], index: number[]): number[]; - getVisitFn(node: Data | T): (data: Data, index: number) => number; - getVisitFn(node: T): (data: Data>, index: number) => number; - visitBinary(data: Data, index: number): number; - visitLargeBinary(data: Data, index: number): number; - visitUtf8(data: Data, index: number): number; - visitLargeUtf8(data: Data, index: number): number; - visitList(data: Data, index: number): number; - visitDenseUnion(data: Data, index: number): number; - visitSparseUnion(data: Data, index: number): number; - visitFixedSizeList(data: Data, index: number): number; -} - -/** @ignore */ -export class GetByteLengthVisitor extends Visitor { - public visitNull(____: Data, _: number) { - return 0; - } - public visitInt(data: Data, _: number) { - return data.type.bitWidth / 8; - } - public visitFloat(data: Data, _: number) { - return data.type.ArrayType.BYTES_PER_ELEMENT; - } - public visitBool(____: Data, _: number) { - return 1 / 8; - } - public visitDecimal(data: Data, _: number) { - return data.type.bitWidth / 8; - } - public visitDate(data: Data, _: number) { - return (data.type.unit + 1) * 4; - } - public visitTime(data: Data